osbm committed on
Commit
442bc35
1 Parent(s): 7baef6e

add tokenizer

Browse files
Files changed (6) hide show
  1. config.json +3 -10
  2. merges.txt +162 -0
  3. special_tokens_map.json +1 -0
  4. tokenizer.json +1 -0
  5. tokenizer_config.json +1 -0
  6. vocab.json +1 -0
config.json CHANGED
@@ -1,7 +1,6 @@
1
  {
2
- "_name_or_path": "data/saved_models/our_90epochs_saved_model",
3
  "architectures": [
4
- "RobertaForSelfiesClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
  "bos_token_id": 0,
@@ -10,19 +9,13 @@
10
  "hidden_act": "gelu",
11
  "hidden_dropout_prob": 0.1,
12
  "hidden_size": 768,
13
- "id2label": {
14
- "0": "LABEL_0"
15
- },
16
  "initializer_range": 0.02,
17
  "intermediate_size": 3072,
18
- "label2id": {
19
- "LABEL_0": 0
20
- },
21
  "layer_norm_eps": 1e-12,
22
  "max_position_embeddings": 514,
23
  "model_type": "roberta",
24
- "num_attention_heads": 4,
25
- "num_hidden_layers": 12,
26
  "pad_token_id": 1,
27
  "position_embedding_type": "absolute",
28
  "torch_dtype": "float32",
 
1
  {
 
2
  "architectures": [
3
+ "RobertaForMaskedLM"
4
  ],
5
  "attention_probs_dropout_prob": 0.1,
6
  "bos_token_id": 0,
 
9
  "hidden_act": "gelu",
10
  "hidden_dropout_prob": 0.1,
11
  "hidden_size": 768,
 
 
 
12
  "initializer_range": 0.02,
13
  "intermediate_size": 3072,
 
 
 
14
  "layer_norm_eps": 1e-12,
15
  "max_position_embeddings": 514,
16
  "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 8,
19
  "pad_token_id": 1,
20
  "position_embedding_type": "absolute",
21
  "torch_dtype": "float32",
merges.txt ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version: 0.2 - Trained by `huggingface/tokenizers`
2
+ B r
3
+ a n
4
+ c h
5
+ Br an
6
+ Bran ch
7
+ Branch 1
8
+ = C
9
+ R i
10
+ n g
11
+ Ri ng
12
+ Ring 1
13
+ = Branch1
14
+ Branch 2
15
+ = O
16
+ Ring 2
17
+ H 1
18
+ C @
19
+ = N
20
+ # Branch1
21
+ C@ @
22
+ = Branch2
23
+ C@ H1
24
+ C@@ H1
25
+ # Branch2
26
+ C l
27
+ # C
28
+ / C
29
+ N H1
30
+ + 1
31
+ - 1
32
+ = Ring1
33
+ O -1
34
+ N +1
35
+ \ C
36
+ / N
37
+ # N
38
+ = Ring2
39
+ = S
40
+ =N +1
41
+ N a
42
+ Na +1
43
+ \ N
44
+ S +1
45
+ / O
46
+ \ S
47
+ \ O
48
+ Br -1
49
+ I -1
50
+ Cl -1
51
+ / C@H1
52
+ Branch 3
53
+ / C@@H1
54
+ = P
55
+ / S
56
+ =N -1
57
+ S i
58
+ K +1
59
+ N -1
60
+ S e
61
+ L i
62
+ Li +1
63
+ + 3
64
+ Cl +3
65
+ \ C@H1
66
+ Ring 3
67
+ \ C@@H1
68
+ / N+1
69
+ / P
70
+ \ F
71
+ P @
72
+ 2 H
73
+ P H1
74
+ / Br
75
+ N @
76
+ P +1
77
+ / Cl
78
+ \ NH1
79
+ \ Br
80
+ @ +1
81
+ / I
82
+ / C@
83
+ T e
84
+ \ N+1
85
+ P@ @
86
+ 1 2
87
+ 5 I
88
+ \ O-1
89
+ 12 5I
90
+ / F
91
+ # N+1
92
+ \ Cl
93
+ N@ +1
94
+ \ I
95
+ - /
96
+ / C@@
97
+ N@ @
98
+ N@ @+1
99
+ -/ Ring2
100
+ - \
101
+ 1 4
102
+ B -1
103
+ C -1
104
+ S @+1
105
+ 14 C
106
+ H 2
107
+ H 4
108
+ I +1
109
+ S -1
110
+ \ P
111
+ =S +1
112
+ =P @
113
+ Si H4
114
+ + 2
115
+ 3 H
116
+ @ @+1
117
+ A g
118
+ C +1
119
+ S @@+1
120
+ Cl +1
121
+ =S e
122
+ -\ Ring1
123
+ H 0
124
+ O H0
125
+ 1 1
126
+ = Branch3
127
+ = Te
128
+ M g
129
+ O +1
130
+ Z n
131
+ \ C@
132
+ \ S+1
133
+ H1 -1
134
+ Se H1
135
+ P@ +1
136
+ -\ Ring2
137
+ 11 C
138
+ =Te +1
139
+ Zn +2
140
+ / NH1
141
+ 1 8
142
+ A s
143
+ B H2
144
+ B H1-1
145
+ C a
146
+ H 3
147
+ O H1-1
148
+ S H2
149
+ =O +1
150
+ Se +1
151
+ Te H2
152
+ 125I H1
153
+ -/ Ring1
154
+ 14C H2
155
+ Ag +1
156
+ =Se +1
157
+ Mg H2
158
+ Mg +2
159
+ 11C H3
160
+ 18 F
161
+ BH2 -1
162
+ Ca +2
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
tokenizer.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"<unk>","single_word":false,"lstrip":false,"rstrip":false,"normalized":true},{"id":1,"special":true,"content":"<s>","single_word":false,"lstrip":false,"rstrip":false,"normalized":true},{"id":2,"special":true,"content":"</s>","single_word":false,"lstrip":false,"rstrip":false,"normalized":true},{"id":3,"special":true,"content":"<pad>","single_word":false,"lstrip":false,"rstrip":false,"normalized":true},{"id":4,"special":true,"content":"<mask>","single_word":false,"lstrip":true,"rstrip":false,"normalized":true}],"normalizer":null,"pre_tokenizer":{"type":"ByteLevel","add_prefix_space":false,"trim_offsets":true},"post_processor":{"type":"RobertaProcessing","sep":["</s>",2],"cls":["<s>",1],"trim_offsets":true,"add_prefix_space":false},"decoder":{"type":"ByteLevel","add_prefix_space":true,"trim_offsets":true},"model":{"type":"BPE","dropout":null,"unk_token":null,"continuing_subword_prefix":"","end_of_word_suffix":"","fuse_unk":false,"vocab":{"<unk>":0,"<s>":1,"</s>":2,"<pad>":3,"<mask>":4,"\n":5,"#":6,"+":7,"-":8,".":9,"/":10,"0":11,"1":12,"2":13,"3":14,"4":15,"5":16,"8":17,"=":18,"@":19,"A":20,"B":21,"C":22,"F":23,"H":24,"I":25,"K":26,"L":27,"M":28,"N":29,"O":30,"P":31,"R":32,"S":33,"T":34,"Z":35,"\\":36,"a":37,"c":38,"e":39,"g":40,"h":41,"i":42,"l":43,"n":44,"r":45,"s":46,"Br":47,"an":48,"ch":49,"Bran":50,"Branch":51,"Branch1":52,"=C":53,"Ri":54,"ng":55,"Ring":56,"Ring1":57,"=Branch1":58,"Branch2":59,"=O":60,"Ring2":61,"H1":62,"C@":63,"=N":64,"#Branch1":65,"C@@":66,"=Branch2":67,"C@H1":68,"C@@H1":69,"#Branch2":70,"Cl":71,"#C":72,"/C":73,"NH1":74,"+1":75,"-1":76,"=Ring1":77,"O-1":78,"N+1":79,"\\C":80,"/N":81,"#N":82,"=Ring2":83,"=S":84,"=N+1":85,"Na":86,"Na+1":87,"\\N":88,"S+1":89,"/O":90,"\\S":91,"\\O":92,"Br-1":93,"I-1":94,"Cl-1":95,"/C@H1":96,"Branch3":97,"/C@@H1":98,"=P":99,"/S":100,"=N-1":101,"Si":102,"K+1":103,"N-1":104,"Se":105,"Li":106,"Li+1":107,"+3":108,"Cl+3":109,"\\C@H1":110,"Ring3":111,"\\C@@H1":112,"/N+1":113,"/P":114,"\\F":115,"P@":116,"2H":117,"PH1":118,"/Br":119,"N@":120,"P+1":121,"/Cl":122,"\\NH1":123,"\\Br":124,"@+1":125,"/I":126,"/C@":127,"Te":128,"\\N+1":129,"P@@":130,"12":131,"5I":132,"\\O-1":133,"125I":134,"/F":135,"#N+1":136,"\\Cl":137,"N@+1":138,"\\I":139,"-/":140,"/C@@":141,"N@@":142,"N@@+1":143,"-/Ring2":144,"-\\":145,"14":146,"B-1":147,"C-1":148,"S@+1":149,"14C":150,"H2":151,"H4":152,"I+1":153,"S-1":154,"\\P":155,"=S+1":156,"=P@":157,"SiH4":158,"+2":159,"3H":160,"@@+1":161,"Ag":162,"C+1":163,"S@@+1":164,"Cl+1":165,"=Se":166,"-\\Ring1":167,"H0":168,"OH0":169,"11":170,"=Branch3":171,"=Te":172,"Mg":173,"O+1":174,"Zn":175,"\\C@":176,"\\S+1":177,"H1-1":178,"SeH1":179,"P@+1":180,"-\\Ring2":181,"11C":182,"=Te+1":183,"Zn+2":184,"/NH1":185,"18":186,"As":187,"BH2":188,"BH1-1":189,"Ca":190,"H3":191,"OH1-1":192,"SH2":193,"=O+1":194,"Se+1":195,"TeH2":196,"125IH1":197,"-/Ring1":198,"14CH2":199,"Ag+1":200,"=Se+1":201,"MgH2":202,"Mg+2":203,"11CH3":204,"18F":205,"BH2-1":206,"Ca+2":207},"merges":["B r","a n","c h","Br an","Bran ch","Branch 1","= C","R i","n g","Ri ng","Ring 1","= Branch1","Branch 2","= O","Ring 2","H 1","C @","= N","# Branch1","C@ @","= Branch2","C@ H1","C@@ H1","# Branch2","C l","# C","/ C","N H1","+ 1","- 1","= Ring1","O -1","N +1","\\ C","/ N","# N","= Ring2","= S","=N +1","N a","Na +1","\\ N","S +1","/ O","\\ S","\\ O","Br -1","I -1","Cl -1","/ C@H1","Branch 3","/ C@@H1","= P","/ S","=N -1","S i","K +1","N -1","S e","L i","Li +1","+ 3","Cl +3","\\ C@H1","Ring 3","\\ C@@H1","/ N+1","/ P","\\ F","P @","2 H","P H1","/ Br","N @","P +1","/ Cl","\\ NH1","\\ Br","@ +1","/ I","/ C@","T e","\\ N+1","P@ @","1 2","5 I","\\ O-1","12 5I","/ F","# N+1","\\ Cl","N@ +1","\\ I","- /","/ C@@","N@ @","N@ @+1","-/ Ring2","- \\","1 4","B -1","C -1","S @+1","14 C","H 2","H 4","I +1","S -1","\\ P","=S +1","=P @","Si H4","+ 2","3 H","@ @+1","A g","C +1","S @@+1","Cl +1","=S e","-\\ Ring1","H 0","O H0","1 1","= Branch3","= Te","M g","O +1","Z n","\\ C@","\\ S+1","H1 -1","Se H1","P@ +1","-\\ Ring2","11 C","=Te +1","Zn +2","/ NH1","1 8","A s","B H2","B H1-1","C a","H 3","O H1-1","S H2","=O +1","Se +1","Te H2","125I H1","-/ Ring1","14C H2","Ag +1","=Se +1","Mg H2","Mg +2","11C H3","18 F","BH2 -1","Ca +2"]}}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "special_tokens_map_file": null, "name_or_path": "./data/bpe/", "tokenizer_class": "RobertaTokenizer"}
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<unk>":0,"<s>":1,"</s>":2,"<pad>":3,"<mask>":4,"\n":5,"#":6,"+":7,"-":8,".":9,"/":10,"0":11,"1":12,"2":13,"3":14,"4":15,"5":16,"8":17,"=":18,"@":19,"A":20,"B":21,"C":22,"F":23,"H":24,"I":25,"K":26,"L":27,"M":28,"N":29,"O":30,"P":31,"R":32,"S":33,"T":34,"Z":35,"\\":36,"a":37,"c":38,"e":39,"g":40,"h":41,"i":42,"l":43,"n":44,"r":45,"s":46,"Br":47,"an":48,"ch":49,"Bran":50,"Branch":51,"Branch1":52,"=C":53,"Ri":54,"ng":55,"Ring":56,"Ring1":57,"=Branch1":58,"Branch2":59,"=O":60,"Ring2":61,"H1":62,"C@":63,"=N":64,"#Branch1":65,"C@@":66,"=Branch2":67,"C@H1":68,"C@@H1":69,"#Branch2":70,"Cl":71,"#C":72,"/C":73,"NH1":74,"+1":75,"-1":76,"=Ring1":77,"O-1":78,"N+1":79,"\\C":80,"/N":81,"#N":82,"=Ring2":83,"=S":84,"=N+1":85,"Na":86,"Na+1":87,"\\N":88,"S+1":89,"/O":90,"\\S":91,"\\O":92,"Br-1":93,"I-1":94,"Cl-1":95,"/C@H1":96,"Branch3":97,"/C@@H1":98,"=P":99,"/S":100,"=N-1":101,"Si":102,"K+1":103,"N-1":104,"Se":105,"Li":106,"Li+1":107,"+3":108,"Cl+3":109,"\\C@H1":110,"Ring3":111,"\\C@@H1":112,"/N+1":113,"/P":114,"\\F":115,"P@":116,"2H":117,"PH1":118,"/Br":119,"N@":120,"P+1":121,"/Cl":122,"\\NH1":123,"\\Br":124,"@+1":125,"/I":126,"/C@":127,"Te":128,"\\N+1":129,"P@@":130,"12":131,"5I":132,"\\O-1":133,"125I":134,"/F":135,"#N+1":136,"\\Cl":137,"N@+1":138,"\\I":139,"-/":140,"/C@@":141,"N@@":142,"N@@+1":143,"-/Ring2":144,"-\\":145,"14":146,"B-1":147,"C-1":148,"S@+1":149,"14C":150,"H2":151,"H4":152,"I+1":153,"S-1":154,"\\P":155,"=S+1":156,"=P@":157,"SiH4":158,"+2":159,"3H":160,"@@+1":161,"Ag":162,"C+1":163,"S@@+1":164,"Cl+1":165,"=Se":166,"-\\Ring1":167,"H0":168,"OH0":169,"11":170,"=Branch3":171,"=Te":172,"Mg":173,"O+1":174,"Zn":175,"\\C@":176,"\\S+1":177,"H1-1":178,"SeH1":179,"P@+1":180,"-\\Ring2":181,"11C":182,"=Te+1":183,"Zn+2":184,"/NH1":185,"18":186,"As":187,"BH2":188,"BH1-1":189,"Ca":190,"H3":191,"OH1-1":192,"SH2":193,"=O+1":194,"Se+1":195,"TeH2":196,"125IH1":197,"-/Ring1":198,"14CH2":199,"Ag+1":200,"=Se+1":201,"MgH2":202,"Mg+2":203,"11CH3":204,"18F":205,"BH2-1":206,"Ca+2":207}