shoffman committed
Commit c551b8a
1 Parent(s): 44a1a31

Upload custom code for molformer models

config.json CHANGED
@@ -2,6 +2,10 @@
   "architectures": [
     "MolformerForMaskedLM"
   ],
+  "auto_map": {
+    "AutoConfig": "configuration_molformer.MolformerConfig",
+    "AutoModelForMaskedLM": "modeling_molformer.MolformerForMaskedLM"
+  },
   "classifier_dropout_prob": null,
   "classifier_skip_connection": true,
   "deterministic_eval": false,
configuration_molformer.py ADDED
@@ -0,0 +1,158 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Molformer model configuration"""
16
+
17
+ from collections import OrderedDict
18
+ from typing import Mapping
19
+
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.onnx import OnnxConfig
22
+ from transformers.utils import logging
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+ MOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
28
+ "ibm/MoLFormer-XL-both-10pct": "https://huggingface.co/ibm/MoLFormer-XL-both-10pct/resolve/main/config.json",
29
+ }
30
+
31
+
32
+ class MolformerConfig(PretrainedConfig):
33
+ r"""
34
+ This is the configuration class to store the configuration of a [`MolformerModel`]. It is used to instantiate a
35
+ Molformer model according to the specified arguments, defining the model architecture. Instantiating a
36
+ configuration with the defaults will yield a similar configuration to that of the Molformer
37
+ [ibm/MoLFormer-XL-both-10pct](https://huggingface.co/ibm/MoLFormer-XL-both-10pct) architecture.
38
+
39
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
40
+ documentation from [`PretrainedConfig`] for more information.
41
+
42
+
43
+ Args:
44
+ vocab_size (`int`, *optional*, defaults to 2362):
45
+ Vocabulary size of the Molformer model. Defines the number of different tokens that can be represented by
46
+ the `input_ids` passed when calling [`MolformerModel`].
47
+ hidden_size (`int`, *optional*, defaults to 768):
48
+ Dimension of the encoder layers and the pooler layer.
49
+ num_hidden_layers (`int`, *optional*, defaults to 12):
50
+ Number of hidden layers in the Transformer encoder.
51
+ num_attention_heads (`int`, *optional*, defaults to 12):
52
+ Number of attention heads for each attention layer in the Transformer encoder.
53
+ intermediate_size (`int`, *optional*, defaults to 768):
54
+ Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
55
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
56
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
57
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
58
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
59
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
60
+ embedding_dropout_prob (`float`, *optional*, defaults to 0.2):
61
+ The dropout probability for the word embeddings.
62
+ max_position_embeddings (`int`, *optional*, defaults to 202):
63
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
64
+ just in case (e.g., 512 or 1024 or 1536).
65
+ initializer_range (`float`, *optional*, defaults to 0.02):
66
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
67
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
68
+ The epsilon used by the layer normalization layers.
69
+ linear_attention_eps (`float`, *optional*, defaults to 1e-06):
70
+ The epsilon used by the linear attention layers normalization step.
71
+ num_random_features (`int`, *optional*, defaults to 32):
72
+ Random feature map dimension used in linear attention.
73
+ feature_map_kernel (`str` or `function`, *optional*, defaults to `"relu"`):
74
+ The non-linear activation function (function or string) in the generalized random features. If string,
75
+ `"gelu"`, `"relu"`, `"selu"`, and `"gelu_new"` are supported.
76
+ deterministic_eval (`bool`, *optional*, defaults to `False`):
77
+ Whether the random features should only be redrawn when training or not. If `True` and `model.training` is
78
+ `False`, linear attention random feature weights will be constant, i.e., deterministic.
79
+ classifier_dropout_prob (`float`, *optional*):
80
+ The dropout probability for the classification head. If `None`, use `hidden_dropout_prob`.
81
+ classifier_skip_connection (`bool`, *optional*, defaults to `True`):
82
+ Whether a skip connection should be made between the layers of the classification head or not.
83
+ pad_token_id (`int`, *optional*, defaults to 2):
84
+ The id of the _padding_ token.
85
+
86
+ Example:
87
+
88
+ ```python
89
+ >>> from transformers import MolformerModel, MolformerConfig
90
+
91
+ >>> # Initializing a Molformer ibm/MoLFormer-XL-both-10pct style configuration
92
+ >>> configuration = MolformerConfig()
93
+
94
+ >>> # Initializing a model from the ibm/MoLFormer-XL-both-10pct style configuration
95
+ >>> model = MolformerModel(configuration)
96
+
97
+ >>> # Accessing the model configuration
98
+ >>> configuration = model.config
99
+ ```"""
100
+ model_type = "molformer"
101
+
102
+ def __init__(
103
+ self,
104
+ vocab_size=2362,
105
+ hidden_size=768,
106
+ num_hidden_layers=12,
107
+ num_attention_heads=12,
108
+ intermediate_size=768,
109
+ hidden_act="gelu",
110
+ hidden_dropout_prob=0.1,
111
+ embedding_dropout_prob=0.2,
112
+ max_position_embeddings=202,
113
+ initializer_range=0.02,
114
+ layer_norm_eps=1e-12,
115
+ linear_attention_eps=1e-6,
116
+ num_random_features=32,
117
+ feature_map_kernel="relu",
118
+ deterministic_eval=False,
119
+ classifier_dropout_prob=None,
120
+ classifier_skip_connection=True,
121
+ pad_token_id=2,
122
+ **kwargs,
123
+ ):
124
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
125
+
126
+ self.vocab_size = vocab_size
127
+ self.hidden_size = hidden_size
128
+ self.num_hidden_layers = num_hidden_layers
129
+ self.num_attention_heads = num_attention_heads
130
+ self.hidden_act = hidden_act
131
+ self.intermediate_size = intermediate_size
132
+ self.hidden_dropout_prob = hidden_dropout_prob
133
+ self.embedding_dropout_prob = embedding_dropout_prob
134
+ self.max_position_embeddings = max_position_embeddings
135
+ self.initializer_range = initializer_range
136
+ self.layer_norm_eps = layer_norm_eps
137
+ self.linear_attention_eps = linear_attention_eps
138
+ self.num_random_features = num_random_features
139
+ self.feature_map_kernel = feature_map_kernel
140
+ self.deterministic_eval = deterministic_eval
141
+ self.classifier_dropout_prob = classifier_dropout_prob
142
+ self.classifier_skip_connection = classifier_skip_connection
143
+
144
+
145
+ # Copied from transformers.models.roberta.configuration_roberta.RobertaOnnxConfig with Roberta->Molformer
146
+ class MolformerOnnxConfig(OnnxConfig):
147
+ @property
148
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
149
+ if self.task == "multiple-choice":
150
+ dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
151
+ else:
152
+ dynamic_axis = {0: "batch", 1: "sequence"}
153
+ return OrderedDict(
154
+ [
155
+ ("input_ids", dynamic_axis),
156
+ ("attention_mask", dynamic_axis),
157
+ ]
158
+ )
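As a quick sanity check of the ONNX export configuration, the sketch below (a hedged example, assuming the file is run from this repository and that the `transformers.onnx` API imported above is available) prints the dynamic axes declared by `MolformerOnnxConfig`:

```python
from configuration_molformer import MolformerConfig, MolformerOnnxConfig

config = MolformerConfig()
onnx_config = MolformerOnnxConfig(config)
# Expected: input_ids and attention_mask, each with dynamic "batch" and "sequence" axes
print(onnx_config.inputs)
```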
convert_molformer_original_checkpoint_to_pytorch.py ADDED
@@ -0,0 +1,87 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Convert Molformer checkpoint."""
16
+
17
+
18
+ import argparse
19
+ import re
20
+
21
+ import torch
22
+
23
+ from transformers import MolformerConfig, MolformerForMaskedLM
24
+ from transformers.utils import logging
25
+
26
+
27
+ logging.set_verbosity_info()
28
+
29
+ RULES = [
30
+ (r"tok_emb", r"molformer.embeddings.word_embeddings"),
31
+ (
32
+ r"blocks\.layers\.(\d+)\.attention\.inner_attention\.feature_map\.omega",
33
+ r"molformer.encoder.layer.\1.attention.self.feature_map.weight",
34
+ ),
35
+ (
36
+ r"blocks\.layers\.(\d+)\.attention\.(query|key|value)_projection",
37
+ r"molformer.encoder.layer.\1.attention.self.\2",
38
+ ),
39
+ (r"blocks\.layers\.(\d+)\.attention\.out_projection", r"molformer.encoder.layer.\1.attention.output.dense"),
40
+ (r"blocks\.layers\.(\d+)\.norm1", r"molformer.encoder.layer.\1.attention.output.LayerNorm"),
41
+ (r"blocks\.layers\.(\d+)\.linear1", r"molformer.encoder.layer.\1.intermediate.dense"),
42
+ (r"blocks\.layers\.(\d+)\.linear2", r"molformer.encoder.layer.\1.output.dense"),
43
+ (r"blocks\.layers\.(\d+)\.norm2", r"molformer.encoder.layer.\1.output.LayerNorm"),
44
+ (r"blocks\.norm", r"molformer.LayerNorm"),
45
+ (r"lang_model\.embed", r"lm_head.transform.dense"),
46
+ (r"lang_model\.ln_f", r"lm_head.transform.LayerNorm"),
47
+ (r"lang_model\.head", r"lm_head.decoder"),
48
+ ]
49
+ for i, (find, replace) in enumerate(RULES):
50
+ RULES[i] = (re.compile(find), replace)
51
+
52
+
53
+ def convert_lightning_checkpoint_to_pytorch(lightning_checkpoint_path, pytorch_dump_path, config=None):
54
+ # Initialise PyTorch model
55
+ config = MolformerConfig(tie_word_embeddings=False) if config is None else MolformerConfig.from_pretrained(config)
56
+ print(f"Building PyTorch model from configuration: {config}")
57
+ model = MolformerForMaskedLM(config)
58
+
59
+ # Load weights from lightning checkpoint
60
+ checkpoint = torch.load(lightning_checkpoint_path, map_location="cpu")
61
+
62
+ state_dict = checkpoint["state_dict"]
63
+ new_state_dict = {}
64
+ for key, val in state_dict.items():
65
+ for find, replace in RULES:
66
+ if find.search(key) is not None:
67
+ new_state_dict[find.sub(replace, key)] = val
68
+ break
69
+ model.load_state_dict(new_state_dict)
70
+
71
+ # Save pytorch-model
72
+ print(f"Save PyTorch model to {pytorch_dump_path}")
73
+ torch.save(model.state_dict(), pytorch_dump_path)
74
+
75
+
76
+ if __name__ == "__main__":
77
+ parser = argparse.ArgumentParser()
78
+ # Required parameters
79
+ parser.add_argument(
80
+ "--lightning_checkpoint_path", default=None, type=str, required=True, help="Path to the Lightning checkpoint."
81
+ )
82
+ parser.add_argument(
83
+ "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
84
+ )
85
+ parser.add_argument("--config", default=None, type=str, help="Path to config.json")
86
+ args = parser.parse_args()
87
+ convert_lightning_checkpoint_to_pytorch(args.lightning_checkpoint_path, args.pytorch_dump_path, config=args.config)
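For reference, a hypothetical invocation of the converter; the checkpoint path below is a placeholder, not a file shipped with this repository:

```python
# Equivalent to:
#   python convert_molformer_original_checkpoint_to_pytorch.py \
#       --lightning_checkpoint_path molformer_original.ckpt \
#       --pytorch_dump_path pytorch_model.bin --config config.json
convert_lightning_checkpoint_to_pytorch(
    "molformer_original.ckpt",  # Lightning checkpoint containing a "state_dict" entry
    "pytorch_model.bin",        # output written with torch.save(model.state_dict(), ...)
    config="config.json",       # optional: reuse this repository's config.json
)
```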
modeling_molformer.py ADDED
@@ -0,0 +1,921 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch Molformer model."""
16
+
17
+
18
+ import math
19
+ from typing import Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.utils.checkpoint
23
+ from torch import nn
24
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
25
+
26
+ from transformers.activations import ACT2FN
27
+ from transformers.modeling_outputs import (
28
+ BaseModelOutput,
29
+ BaseModelOutputWithPooling,
30
+ MaskedLMOutput,
31
+ SequenceClassifierOutput,
32
+ )
33
+ from transformers.modeling_utils import PreTrainedModel
34
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
35
+ from transformers.utils import (
36
+ add_code_sample_docstrings,
37
+ add_start_docstrings,
38
+ add_start_docstrings_to_model_forward,
39
+ logging,
40
+ )
41
+ from .configuration_molformer import MolformerConfig
42
+
43
+
44
+ logger = logging.get_logger(__name__)
45
+
46
+ _CHECKPOINT_FOR_DOC = "ibm/MoLFormer-XL-both-10pct"
47
+ _CONFIG_FOR_DOC = "MolformerConfig"
48
+
49
+ MOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
50
+ "ibm/MoLFormer-XL-both-10pct",
51
+ # See all MoLFormer models at https://huggingface.co/models?filter=molformer
52
+ ]
53
+
54
+
55
+ # Copied from transformers.models.esm.modeling_esm.rotate_half
56
+ def rotate_half(x):
57
+ x1, x2 = x.chunk(2, dim=-1)
58
+ return torch.cat((-x2, x1), dim=-1)
59
+
60
+
61
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
62
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
63
+ cos = cos[position_ids].unsqueeze(1) # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim]
64
+ sin = sin[position_ids].unsqueeze(1)
65
+ q_embed = (q * cos) + (rotate_half(q) * sin)
66
+ k_embed = (k * cos) + (rotate_half(k) * sin)
67
+ return q_embed, k_embed
68
+
69
+
70
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Molformer
71
+ class MolformerRotaryEmbedding(nn.Module):
72
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
73
+ super().__init__()
74
+
75
+ self.dim = dim
76
+ self.max_position_embeddings = max_position_embeddings
77
+ self.base = base
78
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
79
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
80
+
81
+ # Build here to make `torch.jit.trace` work.
82
+ self._set_cos_sin_cache(
83
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
84
+ )
85
+
86
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
87
+ self.max_seq_len_cached = seq_len
88
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
89
+
90
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
91
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
92
+ emb = torch.cat((freqs, freqs), dim=-1)
93
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
94
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
95
+
96
+ def forward(self, x, seq_len=None):
97
+ # x: [bs, num_attention_heads, seq_len, head_size]
98
+ if seq_len > self.max_seq_len_cached:
99
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
100
+
101
+ return (
102
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
103
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
104
+ )
105
+
106
+
107
+ class MolformerEmbeddings(nn.Module):
108
+ """Construct the embeddings from word embeddings."""
109
+
110
+ def __init__(self, config):
111
+ super().__init__()
112
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
113
+ self.dropout = nn.Dropout(config.embedding_dropout_prob)
114
+
115
+ def forward(
116
+ self, input_ids: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None
117
+ ) -> torch.Tensor:
118
+ if inputs_embeds is None:
119
+ inputs_embeds = self.word_embeddings(input_ids)
120
+
121
+ embeddings = inputs_embeds
122
+ embeddings = self.dropout(embeddings)
123
+ return embeddings
124
+
125
+
126
+ class MolformerFeatureMap(nn.Module):
127
+ def __init__(self, config):
128
+ super().__init__()
129
+ self.query_size = config.hidden_size // config.num_attention_heads
130
+ self.num_components = config.num_random_features
131
+ self.orthogonal_random_weights()
132
+ if isinstance(config.feature_map_kernel, str):
133
+ self.kernel = ACT2FN[config.feature_map_kernel]
134
+ else:
135
+ self.kernel = config.feature_map_kernel
136
+ self.deterministic = config.deterministic_eval
137
+
138
+ def orthogonal_random_weights(self, device=None):
139
+ # make sure query size evenly divides feature size (round up)
140
+ num_batches = math.ceil(self.num_components / self.query_size)
141
+
142
+ def orthogonal_batch(size):
143
+ block = torch.randn(size, size, device=device)
144
+ norms = torch.linalg.norm(block, dim=1).unsqueeze(0)
145
+ Q, _ = torch.linalg.qr(block)
146
+ return Q * norms
147
+
148
+ random_weights = torch.cat([orthogonal_batch(self.query_size) for _ in range(num_batches)], dim=1)
149
+ random_weights = random_weights[:, : self.num_components]
150
+ self.register_buffer("weight", random_weights)
151
+
152
+ def forward(self, query, key):
153
+ if not self.deterministic or self.training:
154
+ self.orthogonal_random_weights(query.device)
155
+ # generalized random fourier features
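+ # The queries and keys are projected onto the orthogonal random directions stored in
+ # self.weight and passed through the kernel nonlinearity (ReLU by default), giving
+ # non-negative random features for the linear attention below.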
156
+ query = torch.matmul(query, self.weight)
157
+ key = torch.matmul(key, self.weight)
158
+ return self.kernel(query), self.kernel(key)
159
+
160
+
161
+ class MolformerSelfAttention(nn.Module):
162
+ def __init__(self, config):
163
+ super().__init__()
164
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
165
+ raise ValueError(
166
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
167
+ f"heads ({config.num_attention_heads})"
168
+ )
169
+
170
+ self.num_attention_heads = config.num_attention_heads
171
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
172
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
173
+
174
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
175
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
176
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
177
+
178
+ self.eps = config.linear_attention_eps
179
+
180
+ self.rotary_embeddings = MolformerRotaryEmbedding(
181
+ dim=self.attention_head_size, max_position_embeddings=config.max_position_embeddings
182
+ )
183
+ self.feature_map = MolformerFeatureMap(config)
184
+
185
+ # Copied from transformers.models.bert.modeling_bert.BertSelfAttention.transpose_for_scores
186
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
187
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
188
+ x = x.view(new_x_shape)
189
+ return x.permute(0, 2, 1, 3)
190
+
191
+ def forward(
192
+ self,
193
+ hidden_states: torch.Tensor,
194
+ attention_mask: Optional[torch.FloatTensor] = None,
195
+ position_ids: Optional[torch.LongTensor] = None,
196
+ head_mask: Optional[torch.FloatTensor] = None,
197
+ output_attentions: Optional[bool] = False,
198
+ ) -> Tuple[torch.Tensor]:
199
+ query_layer = self.transpose_for_scores(self.query(hidden_states))
200
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
201
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
202
+
203
+ kv_seq_len = key_layer.shape[-2]
204
+ cos, sin = self.rotary_embeddings(value_layer, seq_len=kv_seq_len)
205
+ query_layer, key_layer = apply_rotary_pos_emb(query_layer, key_layer, cos, sin, position_ids)
206
+ # Apply the feature map to the queries and keys
207
+ query_layer, key_layer = self.feature_map(query_layer, key_layer)
208
+
209
+ if attention_mask is not None:
210
+ # since we don't use softmax, we need to reconvert this mask to 1/0
211
+ attention_mask = (attention_mask == 0).to(attention_mask.dtype)
212
+ # separate original mask from causal mask
213
+ per_query_attn = attention_mask[:, 0, -1]
214
+ per_query_extended = per_query_attn[:, None, None, :]
215
+ if not torch.equal(attention_mask, per_query_extended):
216
+ raise ValueError(
217
+ "MolformerSelfAttention does not support arbitrary 3D attention. attention_mask must be 2D (i.e., [batch size, sequence length])"
218
+ )
219
+
220
+ key_layer = key_layer * per_query_attn[:, None, -kv_seq_len:, None]
221
+
222
+ # linear attention
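+ # With feature-mapped queries Q' and keys K', the output is Q' (K'^T V) / (Q' sum(K')):
+ # key_value aggregates K'^T V once over the sequence, so the cost grows linearly with
+ # sequence length instead of quadratically as in softmax attention.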
223
+ key_value = torch.matmul(key_layer.transpose(-1, -2), value_layer)
224
+ norm = torch.matmul(query_layer, key_layer.sum(dim=-2).unsqueeze(-1)).clamp(min=self.eps)
225
+ context_layer = torch.matmul(query_layer, key_value) / norm
226
+
227
+ if head_mask is not None:
228
+ context_layer = context_layer * head_mask
229
+
230
+ if output_attentions:
231
+ logger.warning(
232
+ "Outputting attentions in linear attention negates the efficiency gains! Only use for visualization/debugging."
233
+ )
234
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
235
+ if attention_mask is not None:
236
+ attention_scores = attention_scores * attention_mask
237
+ attention_probs = nn.functional.normalize(attention_scores, p=1, dim=-1, eps=self.eps)
238
+ if head_mask is not None:
239
+ attention_probs = attention_probs * head_mask
240
+ # recompute context_layer for grad
241
+ context_layer = torch.matmul(attention_probs, value_layer)
242
+
243
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
244
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
245
+ context_layer = context_layer.view(*new_context_layer_shape)
246
+
247
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
248
+
249
+ return outputs
250
+
251
+
252
+ # Copied from transformers.models.bert.modeling_bert.BertSelfOutput
253
+ class MolformerSelfOutput(nn.Module):
254
+ def __init__(self, config):
255
+ super().__init__()
256
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
257
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
258
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
259
+
260
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
261
+ hidden_states = self.dense(hidden_states)
262
+ hidden_states = self.dropout(hidden_states)
263
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
264
+ return hidden_states
265
+
266
+
267
+ class MolformerAttention(nn.Module):
268
+ def __init__(self, config):
269
+ super().__init__()
270
+ self.self = MolformerSelfAttention(config)
271
+ self.output = MolformerSelfOutput(config)
272
+ self.pruned_heads = set()
273
+
274
+ # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads
275
+ def prune_heads(self, heads):
276
+ if len(heads) == 0:
277
+ return
278
+ heads, index = find_pruneable_heads_and_indices(
279
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
280
+ )
281
+
282
+ # Prune linear layers
283
+ self.self.query = prune_linear_layer(self.self.query, index)
284
+ self.self.key = prune_linear_layer(self.self.key, index)
285
+ self.self.value = prune_linear_layer(self.self.value, index)
286
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
287
+
288
+ # Update hyper params and store pruned heads
289
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
290
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
291
+ self.pruned_heads = self.pruned_heads.union(heads)
292
+
293
+ def forward(
294
+ self,
295
+ hidden_states: torch.Tensor,
296
+ attention_mask: Optional[torch.FloatTensor] = None,
297
+ position_ids: Optional[torch.LongTensor] = None,
298
+ head_mask: Optional[torch.FloatTensor] = None,
299
+ output_attentions: Optional[bool] = False,
300
+ ) -> Tuple[torch.Tensor]:
301
+ self_outputs = self.self(
302
+ hidden_states,
303
+ attention_mask,
304
+ position_ids,
305
+ head_mask,
306
+ output_attentions,
307
+ )
308
+ attention_output = self.output(self_outputs[0], hidden_states)
309
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
310
+ return outputs
311
+
312
+
313
+ # Copied from transformers.models.bert.modeling_bert.BertIntermediate
314
+ class MolformerIntermediate(nn.Module):
315
+ def __init__(self, config):
316
+ super().__init__()
317
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
318
+ if isinstance(config.hidden_act, str):
319
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
320
+ else:
321
+ self.intermediate_act_fn = config.hidden_act
322
+
323
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
324
+ hidden_states = self.dense(hidden_states)
325
+ hidden_states = self.intermediate_act_fn(hidden_states)
326
+ return hidden_states
327
+
328
+
329
+ # Copied from transformers.models.bert.modeling_bert.BertOutput
330
+ class MolformerOutput(nn.Module):
331
+ def __init__(self, config):
332
+ super().__init__()
333
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
334
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
335
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
336
+
337
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
338
+ hidden_states = self.dense(hidden_states)
339
+ hidden_states = self.dropout(hidden_states)
340
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
341
+ return hidden_states
342
+
343
+
344
+ class MolformerLayer(nn.Module):
345
+ def __init__(self, config):
346
+ super().__init__()
347
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
348
+ self.seq_len_dim = 1
349
+ self.attention = MolformerAttention(config)
350
+ self.intermediate = MolformerIntermediate(config)
351
+ self.output = MolformerOutput(config)
352
+
353
+ def forward(
354
+ self,
355
+ hidden_states: torch.Tensor,
356
+ attention_mask: Optional[torch.FloatTensor] = None,
357
+ position_ids: Optional[torch.LongTensor] = None,
358
+ head_mask: Optional[torch.FloatTensor] = None,
359
+ output_attentions: Optional[bool] = False,
360
+ ) -> Tuple[torch.Tensor]:
361
+ self_attention_outputs = self.attention(
362
+ hidden_states,
363
+ attention_mask,
364
+ position_ids,
365
+ head_mask,
366
+ output_attentions=output_attentions,
367
+ )
368
+ attention_output = self_attention_outputs[0]
369
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
370
+
371
+ layer_output = apply_chunking_to_forward(
372
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
373
+ )
374
+ outputs = (layer_output,) + outputs
375
+
376
+ return outputs
377
+
378
+ def feed_forward_chunk(self, attention_output):
379
+ intermediate_output = self.intermediate(attention_output)
380
+ layer_output = self.output(intermediate_output, attention_output)
381
+ return layer_output
382
+
383
+
384
+ class MolformerEncoder(nn.Module):
385
+ def __init__(self, config):
386
+ super().__init__()
387
+ self.config = config
388
+ self.layer = nn.ModuleList([MolformerLayer(config) for _ in range(config.num_hidden_layers)])
389
+ self.gradient_checkpointing = False
390
+
391
+ def forward(
392
+ self,
393
+ hidden_states: torch.Tensor,
394
+ attention_mask: Optional[torch.FloatTensor] = None,
395
+ position_ids: Optional[torch.LongTensor] = None,
396
+ head_mask: Optional[torch.FloatTensor] = None,
397
+ output_attentions: Optional[bool] = False,
398
+ output_hidden_states: Optional[bool] = False,
399
+ return_dict: Optional[bool] = True,
400
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutput]:
401
+ all_hidden_states = () if output_hidden_states else None
402
+ all_self_attentions = () if output_attentions else None
403
+
404
+ for i, layer_module in enumerate(self.layer):
405
+ if output_hidden_states:
406
+ all_hidden_states = all_hidden_states + (hidden_states,)
407
+
408
+ layer_head_mask = head_mask[i] if head_mask is not None else None
409
+
410
+ if self.gradient_checkpointing and self.training:
411
+
412
+ def create_custom_forward(module):
413
+ def custom_forward(*inputs):
414
+ return module(*inputs, output_attentions)
415
+
416
+ return custom_forward
417
+
418
+ layer_outputs = torch.utils.checkpoint.checkpoint(
419
+ create_custom_forward(layer_module),
420
+ hidden_states,
421
+ attention_mask,
422
+ position_ids,
423
+ layer_head_mask,
424
+ )
425
+ else:
426
+ layer_outputs = layer_module(
427
+ hidden_states,
428
+ attention_mask,
429
+ position_ids,
430
+ layer_head_mask,
431
+ output_attentions,
432
+ )
433
+
434
+ hidden_states = layer_outputs[0]
435
+ if output_attentions:
436
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
437
+
438
+ if output_hidden_states:
439
+ all_hidden_states = all_hidden_states + (hidden_states,)
440
+
441
+ if not return_dict:
442
+ return tuple(
443
+ v
444
+ for v in [
445
+ hidden_states,
446
+ all_hidden_states,
447
+ all_self_attentions,
448
+ ]
449
+ if v is not None
450
+ )
451
+ return BaseModelOutput(
452
+ last_hidden_state=hidden_states,
453
+ hidden_states=all_hidden_states,
454
+ attentions=all_self_attentions,
455
+ )
456
+
457
+
458
+ # Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform
459
+ class MolformerPredictionHeadTransform(nn.Module):
460
+ def __init__(self, config):
461
+ super().__init__()
462
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
463
+ if isinstance(config.hidden_act, str):
464
+ self.transform_act_fn = ACT2FN[config.hidden_act]
465
+ else:
466
+ self.transform_act_fn = config.hidden_act
467
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
468
+
469
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
470
+ hidden_states = self.dense(hidden_states)
471
+ hidden_states = self.transform_act_fn(hidden_states)
472
+ hidden_states = self.LayerNorm(hidden_states)
473
+ return hidden_states
474
+
475
+
476
+ class MolformerLMPredictionHead(nn.Module):
477
+ def __init__(self, config):
478
+ super().__init__()
479
+ self.transform = MolformerPredictionHeadTransform(config)
480
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
481
+
482
+ def forward(self, hidden_states):
483
+ hidden_states = self.transform(hidden_states)
484
+ hidden_states = self.decoder(hidden_states)
485
+ return hidden_states
486
+
487
+
488
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->Molformer,roberta->molformer
489
+ class MolformerPreTrainedModel(PreTrainedModel):
490
+ """
491
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
492
+ models.
493
+ """
494
+
495
+ config_class = MolformerConfig
496
+ base_model_prefix = "molformer"
497
+ supports_gradient_checkpointing = True
498
+ _no_split_modules = ["MolformerEmbeddings", "MolformerSelfAttention"]
499
+
500
+ # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
501
+ def _init_weights(self, module):
502
+ """Initialize the weights"""
503
+ if isinstance(module, nn.Linear):
504
+ # Slightly different from the TF version which uses truncated_normal for initialization
505
+ # cf https://github.com/pytorch/pytorch/pull/5617
506
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
507
+ if module.bias is not None:
508
+ module.bias.data.zero_()
509
+ elif isinstance(module, nn.Embedding):
510
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
511
+ if module.padding_idx is not None:
512
+ module.weight.data[module.padding_idx].zero_()
513
+ elif isinstance(module, nn.LayerNorm):
514
+ module.bias.data.zero_()
515
+ module.weight.data.fill_(1.0)
516
+
517
+ def _set_gradient_checkpointing(self, module, value=False):
518
+ if isinstance(module, MolformerEncoder):
519
+ module.gradient_checkpointing = value
520
+
521
+
522
+ def masked_avg_pool1d(hidden_states, attention_mask, eps=1e-9):
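+ # Mean-pool token embeddings over the non-padding positions: masked tokens are zeroed
+ # out and the sum is divided by the number of real tokens (clamped by eps to avoid
+ # division by zero).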
523
+ attention_mask = attention_mask.unsqueeze(-1).expand_as(hidden_states).float()
524
+ sum_embeddings = torch.sum(hidden_states * attention_mask, dim=1)
525
+ sum_mask = torch.clamp(attention_mask.sum(dim=1), min=eps)
526
+ embedding = sum_embeddings / sum_mask
527
+ return embedding
528
+
529
+
530
+ MOLFORMER_START_DOCSTRING = r"""
531
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
532
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
533
+ behavior.
534
+
535
+ Parameters:
536
+ config ([`MolformerConfig`]): Model configuration class with all the parameters of the model.
537
+ Initializing with a config file does not load the weights associated with the model, only the
538
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
539
+ """
540
+
541
+ MOLFORMER_INPUTS_DOCSTRING = r"""
542
+ Args:
543
+ input_ids (`torch.LongTensor` of shape `({0})`):
544
+ Indices of input sequence tokens in the vocabulary.
545
+
546
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
547
+ [`PreTrainedTokenizer.__call__`] for details.
548
+
549
+ [What are input IDs?](../glossary#input-ids)
550
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
551
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
552
+
553
+ - 1 for tokens that are **not masked**,
554
+ - 0 for tokens that are **masked**.
555
+
556
+ [What are attention masks?](../glossary#attention-mask)
557
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
558
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
559
+ config.n_positions - 1]`.
560
+
561
+ [What are position IDs?](../glossary#position-ids)
562
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
563
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
564
+
565
+ - 1 indicates the head is **not masked**,
566
+ - 0 indicates the head is **masked**.
567
+
568
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
569
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
570
+ is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
571
+ model's internal embedding lookup matrix.
572
+ output_attentions (`bool`, *optional*):
573
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
574
+ tensors for more detail.
575
+ output_hidden_states (`bool`, *optional*):
576
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
577
+ more detail.
578
+ return_dict (`bool`, *optional*):
579
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
580
+ """
581
+
582
+
583
+ @add_start_docstrings(
584
+ "The bare Molformer Model transformer outputting raw hidden-states without any specific head on top.",
585
+ MOLFORMER_START_DOCSTRING,
586
+ """
587
+ add_pooling_layer (`bool`, *optional*, defaults to `True`):
588
+ Whether or not to apply pooling layer.
589
+ """,
590
+ )
591
+ class MolformerModel(MolformerPreTrainedModel):
592
+ """
593
+
594
+ The model can behave as an encoder (with only self-attention).
595
+ """
596
+
597
+ def __init__(self, config, add_pooling_layer=True):
598
+ super().__init__(config)
599
+ self.config = config
600
+
601
+ self.embeddings = MolformerEmbeddings(config)
602
+ self.encoder = MolformerEncoder(config)
603
+
604
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
605
+ self.pooler = masked_avg_pool1d if add_pooling_layer else None
606
+
607
+ # Initialize weights and apply final processing
608
+ self.post_init()
609
+
610
+ def get_input_embeddings(self):
611
+ return self.embeddings.word_embeddings
612
+
613
+ def set_input_embeddings(self, value):
614
+ self.embeddings.word_embeddings = value
615
+
616
+ def _prune_heads(self, heads_to_prune):
617
+ """
618
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
619
+ class PreTrainedModel
620
+ """
621
+ for layer, heads in heads_to_prune.items():
622
+ self.encoder.layer[layer].attention.prune_heads(heads)
623
+
624
+ @add_start_docstrings_to_model_forward(MOLFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
625
+ @add_code_sample_docstrings(
626
+ checkpoint=_CHECKPOINT_FOR_DOC,
627
+ output_type=BaseModelOutputWithPooling,
628
+ config_class=_CONFIG_FOR_DOC,
629
+ )
630
+ def forward(
631
+ self,
632
+ input_ids: Optional[torch.LongTensor] = None,
633
+ attention_mask: Optional[torch.FloatTensor] = None,
634
+ position_ids: Optional[torch.LongTensor] = None,
635
+ head_mask: Optional[torch.FloatTensor] = None,
636
+ inputs_embeds: Optional[torch.FloatTensor] = None,
637
+ output_attentions: Optional[bool] = None,
638
+ output_hidden_states: Optional[bool] = None,
639
+ return_dict: Optional[bool] = None,
640
+ ) -> Union[BaseModelOutputWithPooling, Tuple[torch.Tensor]]:
641
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
642
+ output_hidden_states = (
643
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
644
+ )
645
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
646
+
647
+ if input_ids is not None and inputs_embeds is not None:
648
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
649
+ elif input_ids is not None:
650
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
651
+ input_shape = input_ids.size()
652
+ elif inputs_embeds is not None:
653
+ input_shape = inputs_embeds.size()[:-1]
654
+ else:
655
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
656
+
657
+ batch_size, seq_length = input_shape
658
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
659
+
660
+ if position_ids is None:
661
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
662
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
663
+ else:
664
+ position_ids = position_ids.view(-1, seq_length).long()
665
+
666
+ if attention_mask is None:
667
+ attention_mask = torch.ones((batch_size, seq_length), device=device)
668
+
669
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
670
+ # ourselves in which case we just need to make it broadcastable to all heads.
671
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
672
+
673
+ # Prepare head mask if needed
674
+ # 1.0 in head_mask indicate we keep the head
675
+ # attention_probs has shape bsz x n_heads x N x N
676
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
677
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
678
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
679
+
680
+ embedding_output = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds)
681
+
682
+ encoder_outputs = self.encoder(
683
+ embedding_output,
684
+ attention_mask=extended_attention_mask,
685
+ position_ids=position_ids,
686
+ head_mask=head_mask,
687
+ output_attentions=output_attentions,
688
+ output_hidden_states=output_hidden_states,
689
+ return_dict=return_dict,
690
+ )
691
+ sequence_output = encoder_outputs[0]
692
+ sequence_output = self.LayerNorm(sequence_output)
693
+ pooled_output = self.pooler(sequence_output, attention_mask) if self.pooler is not None else None
694
+
695
+ if not return_dict:
696
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
697
+
698
+ return BaseModelOutputWithPooling(
699
+ last_hidden_state=sequence_output,
700
+ pooler_output=pooled_output,
701
+ hidden_states=encoder_outputs.hidden_states,
702
+ attentions=encoder_outputs.attentions,
703
+ )
704
+
705
+
706
+ @add_start_docstrings("""Molformer Model with a `language modeling` head on top.""", MOLFORMER_START_DOCSTRING)
707
+ class MolformerForMaskedLM(MolformerPreTrainedModel):
708
+ _tied_weights_keys = ["lm_head.decoder.weight"]
709
+
710
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with Roberta->Molformer,roberta->molformer,LMHead->LMPredictionHead
711
+ def __init__(self, config):
712
+ super().__init__(config)
713
+
714
+ if config.is_decoder:
715
+ logger.warning(
716
+ "If you want to use `MolformerForMaskedLM` make sure `config.is_decoder=False` for "
717
+ "bi-directional self-attention."
718
+ )
719
+
720
+ self.molformer = MolformerModel(config, add_pooling_layer=False)
721
+ self.lm_head = MolformerLMPredictionHead(config)
722
+
723
+ # Initialize weights and apply final processing
724
+ self.post_init()
725
+
726
+ def get_output_embeddings(self):
727
+ return self.lm_head.decoder
728
+
729
+ def set_output_embeddings(self, new_embeddings):
730
+ self.lm_head.decoder = new_embeddings
731
+
732
+ @add_start_docstrings_to_model_forward(MOLFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
733
+ @add_code_sample_docstrings(
734
+ checkpoint=_CHECKPOINT_FOR_DOC,
735
+ output_type=MaskedLMOutput,
736
+ config_class=_CONFIG_FOR_DOC,
737
+ mask="P<mask>", # add extra token so labels line up
738
+ )
739
+ def forward(
740
+ self,
741
+ input_ids: Optional[torch.LongTensor] = None,
742
+ attention_mask: Optional[torch.FloatTensor] = None,
743
+ position_ids: Optional[torch.LongTensor] = None,
744
+ head_mask: Optional[torch.FloatTensor] = None,
745
+ inputs_embeds: Optional[torch.FloatTensor] = None,
746
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
747
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
748
+ labels: Optional[torch.LongTensor] = None,
749
+ output_attentions: Optional[bool] = None,
750
+ output_hidden_states: Optional[bool] = None,
751
+ return_dict: Optional[bool] = None,
752
+ ) -> Union[MaskedLMOutput, Tuple[torch.Tensor]]:
753
+ r"""
754
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
755
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
756
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
757
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
758
+ """
759
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
760
+
761
+ outputs = self.molformer(
762
+ input_ids,
763
+ attention_mask=attention_mask,
764
+ position_ids=position_ids,
765
+ head_mask=head_mask,
766
+ inputs_embeds=inputs_embeds,
767
+ output_attentions=output_attentions,
768
+ output_hidden_states=output_hidden_states,
769
+ return_dict=return_dict,
770
+ )
771
+
772
+ sequence_output = outputs[0]
773
+ prediction_scores = self.lm_head(sequence_output)
774
+
775
+ masked_lm_loss = None
776
+ if labels is not None:
777
+ # move labels to correct device to enable model parallelism
778
+ labels = labels.to(prediction_scores.device)
779
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
780
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
781
+
782
+ if not return_dict:
783
+ output = (prediction_scores,) + outputs[2:]
784
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
785
+
786
+ return MaskedLMOutput(
787
+ loss=masked_lm_loss,
788
+ logits=prediction_scores,
789
+ hidden_states=outputs.hidden_states,
790
+ attentions=outputs.attentions,
791
+ )
792
+
793
+
794
+ class MolformerClassificationHead(nn.Module):
795
+ """Head for sequence-level classification tasks."""
796
+
797
+ def __init__(self, config):
798
+ super().__init__()
799
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
800
+ self.dense2 = nn.Linear(config.hidden_size, config.hidden_size)
801
+ self.dropout = nn.Dropout(
802
+ config.classifier_dropout_prob
803
+ if config.classifier_dropout_prob is not None
804
+ else config.hidden_dropout_prob
805
+ )
806
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
807
+ if isinstance(config.hidden_act, str):
808
+ self.classifier_act_fn = ACT2FN[config.hidden_act]
809
+ else:
810
+ self.classifier_act_fn = config.hidden_act
811
+ self.skip_connection = config.classifier_skip_connection
812
+
813
+ def forward(self, pooled_output):
814
+ hidden_state = self.dense(pooled_output)
815
+ hidden_state = self.dropout(hidden_state)
816
+ hidden_state = self.classifier_act_fn(hidden_state)
817
+ if self.skip_connection:
818
+ hidden_state = residual = hidden_state + pooled_output
819
+ hidden_state = self.dense2(hidden_state)
820
+ hidden_state = self.dropout(hidden_state)
821
+ hidden_state = self.classifier_act_fn(hidden_state)
822
+ if self.skip_connection:
823
+ hidden_state = hidden_state + residual
824
+ logits = self.out_proj(hidden_state)
825
+ return logits
826
+
827
+
828
+ @add_start_docstrings(
829
+ """
830
+ Molformer Model transformer with a sequence classification/regression head on top (a linear layer on top of the
831
+ pooled output) e.g. for MoleculeNet tasks.
832
+ """,
833
+ MOLFORMER_START_DOCSTRING,
834
+ )
835
+ class MolformerForSequenceClassification(MolformerPreTrainedModel):
836
+ def __init__(self, config):
837
+ super().__init__(config)
838
+ self.num_labels = config.num_labels
839
+ self.config = config
840
+
841
+ self.molformer = MolformerModel(config, add_pooling_layer=True)
842
+ self.classifier = MolformerClassificationHead(config)
843
+
844
+ # Initialize weights and apply final processing
845
+ self.post_init()
846
+
847
+ @add_start_docstrings_to_model_forward(MOLFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
848
+ @add_code_sample_docstrings(
849
+ checkpoint=_CHECKPOINT_FOR_DOC,
850
+ output_type=SequenceClassifierOutput,
851
+ config_class=_CONFIG_FOR_DOC,
852
+ )
853
+ def forward(
854
+ self,
855
+ input_ids: Optional[torch.LongTensor] = None,
856
+ attention_mask: Optional[torch.FloatTensor] = None,
857
+ position_ids: Optional[torch.LongTensor] = None,
858
+ head_mask: Optional[torch.FloatTensor] = None,
859
+ inputs_embeds: Optional[torch.FloatTensor] = None,
860
+ labels: Optional[torch.LongTensor] = None,
861
+ output_attentions: Optional[bool] = None,
862
+ output_hidden_states: Optional[bool] = None,
863
+ return_dict: Optional[bool] = None,
864
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
865
+ r"""
866
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
867
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
868
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
869
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
870
+ """
871
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
872
+
873
+ outputs = self.molformer(
874
+ input_ids,
875
+ attention_mask=attention_mask,
876
+ position_ids=position_ids,
877
+ head_mask=head_mask,
878
+ inputs_embeds=inputs_embeds,
879
+ output_attentions=output_attentions,
880
+ output_hidden_states=output_hidden_states,
881
+ return_dict=return_dict,
882
+ )
883
+
884
+ pooled_output = outputs[1]
885
+ logits = self.classifier(pooled_output)
886
+
887
+ loss = None
888
+ if labels is not None:
889
+ # move labels to correct device to enable model parallelism
890
+ labels = labels.to(logits.device)
891
+ if self.config.problem_type is None:
892
+ if self.num_labels == 1:
893
+ self.config.problem_type = "regression"
894
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
895
+ self.config.problem_type = "single_label_classification"
896
+ else:
897
+ self.config.problem_type = "multi_label_classification"
898
+
899
+ if self.config.problem_type == "regression":
900
+ loss_fct = MSELoss()
901
+ if self.num_labels == 1:
902
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
903
+ else:
904
+ loss = loss_fct(logits, labels)
905
+ elif self.config.problem_type == "single_label_classification":
906
+ loss_fct = CrossEntropyLoss()
907
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
908
+ elif self.config.problem_type == "multi_label_classification":
909
+ loss_fct = BCEWithLogitsLoss()
910
+ loss = loss_fct(logits, labels)
911
+
912
+ if not return_dict:
913
+ output = (logits,) + outputs[2:]
914
+ return ((loss,) + output) if loss is not None else output
915
+
916
+ return SequenceClassifierOutput(
917
+ loss=loss,
918
+ logits=logits,
919
+ hidden_states=outputs.hidden_states,
920
+ attentions=outputs.attentions,
921
+ )
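The sequence classification model above feeds the masked-mean pooled output of `MolformerModel` into `MolformerClassificationHead`. A minimal, hedged sketch of a forward pass with a randomly initialized two-label head (placeholder token ids; in practice the backbone weights would come from the pretrained checkpoint and the ids from the tokenizer below):

```python
import torch

from configuration_molformer import MolformerConfig
from modeling_molformer import MolformerForSequenceClassification

config = MolformerConfig(num_labels=2)
model = MolformerForSequenceClassification(config)

input_ids = torch.tensor([[0, 16, 17, 18, 1]])   # placeholder token ids within the vocab
attention_mask = torch.ones_like(input_ids)
labels = torch.tensor([1])

outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
print(outputs.loss, outputs.logits.shape)        # scalar loss, logits of shape (1, 2)
```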
tokenization_molformer.py ADDED
@@ -0,0 +1,226 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for Molformer."""
16
+
17
+ import collections
18
+ import json
19
+ import os
20
+ import re
21
+ from typing import List, Optional, Tuple
22
+
23
+ from transformers.tokenization_utils import PreTrainedTokenizer
24
+ from transformers.utils import logging
25
+
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
30
+
31
+ PRETRAINED_VOCAB_FILES_MAP = {
32
+ "vocab_file": {
33
+ "ibm/MoLFormer-XL-both-10pct": "https://huggingface.co/ibm/MoLFormer-XL-both-10pct/resolve/main/vocab.json",
34
+ }
35
+ }
36
+
37
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
38
+ "ibm/MoLFormer-XL-both-10pct": 202,
39
+ }
40
+
41
+
42
+ class MolformerTokenizer(PreTrainedTokenizer):
43
+ r"""
44
+ Construct a Molformer tokenizer. Based on regex.
45
+
46
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
47
+ this superclass for more information regarding those methods.
48
+
49
+ Args:
50
+ vocab_file (`str`):
51
+ File containing the vocabulary.
52
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
53
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
54
+ token instead.
55
+ sep_token (`str`, *optional*, defaults to `"<eos>"`):
56
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
57
+ sequence classification or for a text and a question for question answering. It is also used as the last
58
+ token of a sequence built with special tokens.
59
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
60
+ The token used for padding, for example when batching sequences of different lengths.
61
+ cls_token (`str`, *optional*, defaults to `"<bos>"`):
62
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
63
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
64
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
65
+ The token used for masking values. This is the token used when training this model with masked language
66
+ modeling. This is the token which the model will try to predict.
67
+ """
68
+
69
+ vocab_files_names = VOCAB_FILES_NAMES
70
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
71
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
72
+ model_input_names = ["input_ids", "attention_mask"]
73
+
74
+ def __init__(
75
+ self,
76
+ vocab_file,
77
+ unk_token="<unk>",
78
+ sep_token="<eos>",
79
+ pad_token="<pad>",
80
+ cls_token="<bos>",
81
+ mask_token="<mask>",
82
+ **kwargs,
83
+ ):
84
+ if not os.path.isfile(vocab_file):
85
+ raise ValueError(
86
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from an IBM pretrained"
87
+ " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
88
+ )
89
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
90
+ self.vocab = json.load(vocab_handle)
91
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
92
+ self.pattern = (
93
+ r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
94
+ )
95
+ self.regex_tokenizer = re.compile(self.pattern)
96
+
97
+ super().__init__(
98
+ unk_token=unk_token,
99
+ sep_token=sep_token,
100
+ pad_token=pad_token,
101
+ cls_token=cls_token,
102
+ mask_token=mask_token,
103
+ **kwargs,
104
+ )
105
+
106
+ @property
107
+ def vocab_size(self):
108
+ return len(self.vocab)
109
+
110
+ def get_vocab(self):
111
+ return dict(self.vocab, **self.added_tokens_encoder)
112
+
113
+ def _tokenize(self, text):
114
+ split_tokens = self.regex_tokenizer.findall(text)
115
+ return split_tokens
116
+
117
+ def _convert_token_to_id(self, token):
118
+ """Converts a token (str) in an id using the vocab."""
119
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
120
+
121
+ def _convert_id_to_token(self, index):
122
+ """Converts an index (integer) in a token (str) using the vocab."""
123
+ return self.ids_to_tokens.get(index, self.unk_token)
124
+
125
+ def convert_tokens_to_string(self, tokens):
126
+ """Converts a sequence of tokens (string) in a single string."""
127
+ out_string = "".join(tokens).strip()
128
+ return out_string
129
+
130
+ # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
131
+ def build_inputs_with_special_tokens(
132
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
133
+ ) -> List[int]:
134
+ """
135
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
136
+ adding special tokens. A BERT sequence has the following format:
137
+
138
+ - single sequence: `[CLS] X [SEP]`
139
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
140
+
141
+ Args:
142
+ token_ids_0 (`List[int]`):
143
+ List of IDs to which the special tokens will be added.
144
+ token_ids_1 (`List[int]`, *optional*):
145
+ Optional second list of IDs for sequence pairs.
146
+
147
+ Returns:
148
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
149
+ """
150
+ if token_ids_1 is None:
151
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
152
+ cls = [self.cls_token_id]
153
+ sep = [self.sep_token_id]
154
+ return cls + token_ids_0 + sep + token_ids_1 + sep
155
+
156
+ # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
157
+ def get_special_tokens_mask(
158
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
159
+ ) -> List[int]:
160
+ """
161
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
162
+ special tokens using the tokenizer `prepare_for_model` method.
163
+
164
+ Args:
165
+ token_ids_0 (`List[int]`):
166
+ List of IDs.
167
+ token_ids_1 (`List[int]`, *optional*):
168
+ Optional second list of IDs for sequence pairs.
169
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
170
+ Whether or not the token list is already formatted with special tokens for the model.
171
+
172
+ Returns:
173
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
174
+ """
175
+
176
+ if already_has_special_tokens:
177
+ return super().get_special_tokens_mask(
178
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
179
+ )
180
+
181
+ if token_ids_1 is not None:
182
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
183
+ return [1] + ([0] * len(token_ids_0)) + [1]
184
+
185
+ # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
186
+ def create_token_type_ids_from_sequences(
187
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
188
+ ) -> List[int]:
189
+ """
190
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
191
+ pair mask has the following format:
192
+
193
+ ```
194
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
195
+ | first sequence | second sequence |
196
+ ```
197
+
198
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
199
+
200
+ Args:
201
+ token_ids_0 (`List[int]`):
202
+ List of IDs.
203
+ token_ids_1 (`List[int]`, *optional*):
204
+ Optional second list of IDs for sequence pairs.
205
+
206
+ Returns:
207
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
208
+ """
209
+ sep = [self.sep_token_id]
210
+ cls = [self.cls_token_id]
211
+ if token_ids_1 is None:
212
+ return len(cls + token_ids_0 + sep) * [0]
213
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
214
+
215
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
216
+ if not os.path.isdir(save_directory):
217
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
218
+ return
219
+ vocab_file = os.path.join(
220
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
221
+ )
222
+
223
+ with open(vocab_file, "w", encoding="utf-8") as f:
224
+ f.write(json.dumps(self.vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
225
+
226
+ return (vocab_file,)
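
For reviewers, a minimal usage sketch of the slow tokenizer above (not part of the uploaded files). It assumes `tokenization_molformer.py` and a local `vocab.json` are in the working directory; the aspirin SMILES string is only an illustrative input.

# Sketch: construct MolformerTokenizer directly from a local vocab.json (assumed paths).
from tokenization_molformer import MolformerTokenizer

tokenizer = MolformerTokenizer(vocab_file="vocab.json")

smiles = "CC(=O)Oc1ccccc1C(=O)O"      # aspirin, illustrative input only
print(tokenizer.tokenize(smiles))      # regex split into SMILES tokens: ['C', 'C', '(', '=', 'O', ')', ...]
print(tokenizer(smiles)["input_ids"])  # ids wrapped in <bos> ... <eos> by build_inputs_with_special_tokens
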
tokenization_molformer_fast.py ADDED
@@ -0,0 +1,153 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for Molformer."""
16
+ from typing import List, Optional, Tuple
17
+
18
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
19
+ from transformers.utils import logging
20
+ from .tokenization_molformer import MolformerTokenizer
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json"}
26
+
27
+ PRETRAINED_VOCAB_FILES_MAP = {
28
+ "vocab_file": {
29
+ "ibm/MoLFormer-XL-both-10pct": "https://huggingface.co/ibm/MoLFormer-XL-both-10pct/resolve/main/vocab.json",
30
+ }
31
+ }
32
+
33
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
34
+ "ibm/MoLFormer-XL-both-10pct": 202,
35
+ }
36
+
37
+
38
+ class MolformerTokenizerFast(PreTrainedTokenizerFast):
39
+ r"""
40
+ Construct a "fast" Molformer tokenizer.
41
+
42
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
43
+ refer to this superclass for more information regarding those methods.
44
+
45
+ Args:
46
+ vocab_file (`str`, *optional*):
47
+ File containing the vocabulary.
48
+ tokenizer_file (`str`, *optional*):
49
+ The path to a tokenizer file to use instead of the vocab file.
50
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
51
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
52
+ token instead.
53
+ sep_token (`str`, *optional*, defaults to `"<eos>"`):
54
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
55
+ sequence classification or for a text and a question for question answering. It is also used as the last
56
+ token of a sequence built with special tokens.
57
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
58
+ The token used for padding, for example when batching sequences of different lengths.
59
+ cls_token (`str`, *optional*, defaults to `"<bos>"`):
60
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
61
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
62
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
63
+ The token used for masking values. This is the token used when training this model with masked language
64
+ modeling. This is the token which the model will try to predict.
65
+ """
66
+
67
+ vocab_files_names = VOCAB_FILES_NAMES
68
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
69
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
70
+ model_input_names = ["input_ids", "attention_mask"]
71
+ slow_tokenizer_class = MolformerTokenizer
72
+
73
+ def __init__(
74
+ self,
75
+ vocab_file=None,
76
+ tokenizer_file=None,
77
+ unk_token="<unk>",
78
+ sep_token="<eos>",
79
+ pad_token="<pad>",
80
+ cls_token="<bos>",
81
+ mask_token="<mask>",
82
+ **kwargs,
83
+ ):
84
+ super().__init__(
85
+ vocab_file,
86
+ tokenizer_file=tokenizer_file,
87
+ unk_token=unk_token,
88
+ sep_token=sep_token,
89
+ pad_token=pad_token,
90
+ cls_token=cls_token,
91
+ mask_token=mask_token,
92
+ **kwargs,
93
+ )
94
+
95
+ # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens
96
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
97
+ """
98
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
99
+ adding special tokens. A BERT sequence has the following format:
100
+
101
+ - single sequence: `[CLS] X [SEP]`
102
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
103
+
104
+ Args:
105
+ token_ids_0 (`List[int]`):
106
+ List of IDs to which the special tokens will be added.
107
+ token_ids_1 (`List[int]`, *optional*):
108
+ Optional second list of IDs for sequence pairs.
109
+
110
+ Returns:
111
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
112
+ """
113
+ output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
114
+
115
+ if token_ids_1 is not None:
116
+ output += token_ids_1 + [self.sep_token_id]
117
+
118
+ return output
119
+
120
+ # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.create_token_type_ids_from_sequences
121
+ def create_token_type_ids_from_sequences(
122
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
123
+ ) -> List[int]:
124
+ """
125
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
126
+ pair mask has the following format:
127
+
128
+ ```
129
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
130
+ | first sequence | second sequence |
131
+ ```
132
+
133
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
134
+
135
+ Args:
136
+ token_ids_0 (`List[int]`):
137
+ List of IDs.
138
+ token_ids_1 (`List[int]`, *optional*):
139
+ Optional second list of IDs for sequence pairs.
140
+
141
+ Returns:
142
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
143
+ """
144
+ sep = [self.sep_token_id]
145
+ cls = [self.cls_token_id]
146
+ if token_ids_1 is None:
147
+ return len(cls + token_ids_0 + sep) * [0]
148
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
149
+
150
+ # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
151
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
152
+ files = self._tokenizer.model.save(save_directory, name=filename_prefix)
153
+ return tuple(files)
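
The `tokenizer.json` added below is the serialized tokenizer that `MolformerTokenizerFast` wraps. As a hedged sketch (not part of the upload, assuming a local copy of that file), it can also be loaded directly with the `tokenizers` library to see the regex pre-tokenizer and the `<bos>`/`<eos>` template at work; "CCO" (ethanol) is only an illustrative SMILES string.

# Sketch: inspect the serialized tokenizer without the wrapper class.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")  # assumed local copy of the file added below
enc = tok.encode("CCO")                      # illustrative SMILES input
print(enc.tokens)                            # ['<bos>', 'C', 'C', 'O', '<eos>'] per the template below
print(enc.ids)                               # [0, 4, 4, 9, 1] per the WordLevel vocab below
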
tokenizer.json ADDED
@@ -0,0 +1,2520 @@
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<bos>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<eos>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<pad>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<mask>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 2361,
44
+ "content": "<unk>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": null,
53
+ "pre_tokenizer": {
54
+ "type": "Sequence",
55
+ "pretokenizers": [
56
+ {
57
+ "type": "Split",
58
+ "pattern": {
59
+ "Regex": "(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\|\\/|:|~|@|\\?|>|\\*|\\$|\\%[0-9]{2}|[0-9])"
60
+ },
61
+ "behavior": "Removed",
62
+ "invert": true
63
+ },
64
+ {
65
+ "type": "Split",
66
+ "pattern": {
67
+ "Regex": "(\\[[^\\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\\(|\\)|\\.|=|#|-|\\+|\\\\|\\/|:|~|@|\\?|>|\\*|\\$|\\%[0-9]{2}|[0-9])"
68
+ },
69
+ "behavior": "Isolated",
70
+ "invert": false
71
+ }
72
+ ]
73
+ },
74
+ "post_processor": {
75
+ "type": "TemplateProcessing",
76
+ "single": [
77
+ {
78
+ "SpecialToken": {
79
+ "id": "<bos>",
80
+ "type_id": 0
81
+ }
82
+ },
83
+ {
84
+ "Sequence": {
85
+ "id": "A",
86
+ "type_id": 0
87
+ }
88
+ },
89
+ {
90
+ "SpecialToken": {
91
+ "id": "<eos>",
92
+ "type_id": 0
93
+ }
94
+ }
95
+ ],
96
+ "pair": [
97
+ {
98
+ "SpecialToken": {
99
+ "id": "<bos>",
100
+ "type_id": 0
101
+ }
102
+ },
103
+ {
104
+ "Sequence": {
105
+ "id": "A",
106
+ "type_id": 0
107
+ }
108
+ },
109
+ {
110
+ "SpecialToken": {
111
+ "id": "<eos>",
112
+ "type_id": 0
113
+ }
114
+ },
115
+ {
116
+ "Sequence": {
117
+ "id": "B",
118
+ "type_id": 1
119
+ }
120
+ },
121
+ {
122
+ "SpecialToken": {
123
+ "id": "<eos>",
124
+ "type_id": 1
125
+ }
126
+ }
127
+ ],
128
+ "special_tokens": {
129
+ "<bos>": {
130
+ "id": "<bos>",
131
+ "ids": [
132
+ 0
133
+ ],
134
+ "tokens": [
135
+ "<bos>"
136
+ ]
137
+ },
138
+ "<eos>": {
139
+ "id": "<eos>",
140
+ "ids": [
141
+ 1
142
+ ],
143
+ "tokens": [
144
+ "<eos>"
145
+ ]
146
+ }
147
+ }
148
+ },
149
+ "decoder": {
150
+ "type": "Fuse"
151
+ },
152
+ "model": {
153
+ "type": "WordLevel",
154
+ "vocab": {
155
+ "<bos>": 0,
156
+ "<eos>": 1,
157
+ "<pad>": 2,
158
+ "<mask>": 3,
159
+ "C": 4,
160
+ "c": 5,
161
+ "(": 6,
162
+ ")": 7,
163
+ "1": 8,
164
+ "O": 9,
165
+ "N": 10,
166
+ "2": 11,
167
+ "=": 12,
168
+ "n": 13,
169
+ "3": 14,
170
+ "[C@H]": 15,
171
+ "[C@@H]": 16,
172
+ "F": 17,
173
+ "S": 18,
174
+ "4": 19,
175
+ "Cl": 20,
176
+ "-": 21,
177
+ "o": 22,
178
+ "s": 23,
179
+ "[nH]": 24,
180
+ "#": 25,
181
+ "/": 26,
182
+ "Br": 27,
183
+ "[C@]": 28,
184
+ "[C@@]": 29,
185
+ "[N+]": 30,
186
+ "[O-]": 31,
187
+ "5": 32,
188
+ "\\": 33,
189
+ ".": 34,
190
+ "I": 35,
191
+ "6": 36,
192
+ "[S@]": 37,
193
+ "[S@@]": 38,
194
+ "P": 39,
195
+ "[N-]": 40,
196
+ "[Si]": 41,
197
+ "7": 42,
198
+ "[n+]": 43,
199
+ "[2H]": 44,
200
+ "8": 45,
201
+ "[NH+]": 46,
202
+ "B": 47,
203
+ "9": 48,
204
+ "[C-]": 49,
205
+ "[Na+]": 50,
206
+ "[Cl-]": 51,
207
+ "[c-]": 52,
208
+ "[CH]": 53,
209
+ "%10": 54,
210
+ "[NH2+]": 55,
211
+ "[P+]": 56,
212
+ "[B]": 57,
213
+ "[I-]": 58,
214
+ "%11": 59,
215
+ "[CH2-]": 60,
216
+ "[O+]": 61,
217
+ "[NH3+]": 62,
218
+ "[C]": 63,
219
+ "[Br-]": 64,
220
+ "[IH2]": 65,
221
+ "[S-]": 66,
222
+ "[cH-]": 67,
223
+ "%12": 68,
224
+ "[nH+]": 69,
225
+ "[B-]": 70,
226
+ "[K+]": 71,
227
+ "[Sn]": 72,
228
+ "[Se]": 73,
229
+ "[CH-]": 74,
230
+ "[HH]": 75,
231
+ "[Y]": 76,
232
+ "[n-]": 77,
233
+ "[CH3-]": 78,
234
+ "[SiH]": 79,
235
+ "[S+]": 80,
236
+ "%13": 81,
237
+ "[SiH2]": 82,
238
+ "[Li+]": 83,
239
+ "[NH-]": 84,
240
+ "%14": 85,
241
+ "[Na]": 86,
242
+ "[CH2]": 87,
243
+ "[O-2]": 88,
244
+ "[U+2]": 89,
245
+ "[W]": 90,
246
+ "[Al]": 91,
247
+ "[P@]": 92,
248
+ "[Fe+2]": 93,
249
+ "[PH+]": 94,
250
+ "%15": 95,
251
+ "[Cl+3]": 96,
252
+ "[Zn+2]": 97,
253
+ "[Ir]": 98,
254
+ "[Mg+2]": 99,
255
+ "[Pt+2]": 100,
256
+ "[OH2+]": 101,
257
+ "[As]": 102,
258
+ "[Fe]": 103,
259
+ "[OH+]": 104,
260
+ "[Zr+2]": 105,
261
+ "[3H]": 106,
262
+ "[Ge]": 107,
263
+ "[SiH3]": 108,
264
+ "[OH-]": 109,
265
+ "[NH4+]": 110,
266
+ "[Cu+2]": 111,
267
+ "[P@@]": 112,
268
+ "p": 113,
269
+ "[Pt]": 114,
270
+ "%16": 115,
271
+ "[Ca+2]": 116,
272
+ "[Zr]": 117,
273
+ "[F-]": 118,
274
+ "[C+]": 119,
275
+ "[Ti]": 120,
276
+ "[P-]": 121,
277
+ "[V]": 122,
278
+ "[se]": 123,
279
+ "[U]": 124,
280
+ "[O]": 125,
281
+ "[Ni+2]": 126,
282
+ "[Zn]": 127,
283
+ "[Co]": 128,
284
+ "[Ni]": 129,
285
+ "[Pd+2]": 130,
286
+ "[Cu]": 131,
287
+ "%17": 132,
288
+ "[Cu+]": 133,
289
+ "[Te]": 134,
290
+ "[H+]": 135,
291
+ "[CH+]": 136,
292
+ "[Li]": 137,
293
+ "[Pd]": 138,
294
+ "[Mo]": 139,
295
+ "[Ru+2]": 140,
296
+ "[o+]": 141,
297
+ "[Re]": 142,
298
+ "[SH+]": 143,
299
+ "%18": 144,
300
+ "[Ac]": 145,
301
+ "[Cr]": 146,
302
+ "[NH2-]": 147,
303
+ "[K]": 148,
304
+ "[13CH2]": 149,
305
+ "[c]": 150,
306
+ "[Zr+4]": 151,
307
+ "[Tl]": 152,
308
+ "[13C]": 153,
309
+ "[Mn]": 154,
310
+ "[N@+]": 155,
311
+ "[Hg]": 156,
312
+ "[Rh]": 157,
313
+ "[Ti+4]": 158,
314
+ "[Sb]": 159,
315
+ "[Co+2]": 160,
316
+ "[Ag+]": 161,
317
+ "[Ru]": 162,
318
+ "%19": 163,
319
+ "[N@@+]": 164,
320
+ "[Ti+2]": 165,
321
+ "[Al+3]": 166,
322
+ "[Pb]": 167,
323
+ "[I+]": 168,
324
+ "[18F]": 169,
325
+ "[s+]": 170,
326
+ "[Rb+]": 171,
327
+ "[Ba+2]": 172,
328
+ "[H-]": 173,
329
+ "[Fe+3]": 174,
330
+ "[Ir+3]": 175,
331
+ "[13cH]": 176,
332
+ "%20": 177,
333
+ "[AlH2]": 178,
334
+ "[Au+]": 179,
335
+ "[13c]": 180,
336
+ "[SH2+]": 181,
337
+ "[Sn+2]": 182,
338
+ "[Mn+2]": 183,
339
+ "[Si-]": 184,
340
+ "[Ag]": 185,
341
+ "[N]": 186,
342
+ "[Bi]": 187,
343
+ "%21": 188,
344
+ "[In]": 189,
345
+ "[CH2+]": 190,
346
+ "[Y+3]": 191,
347
+ "[Ga]": 192,
348
+ "%22": 193,
349
+ "[Co+3]": 194,
350
+ "[Au]": 195,
351
+ "[13CH3]": 196,
352
+ "[Mg]": 197,
353
+ "[Cs+]": 198,
354
+ "[W+2]": 199,
355
+ "[Hf]": 200,
356
+ "[Zn+]": 201,
357
+ "[Se-]": 202,
358
+ "[S-2]": 203,
359
+ "[Ca]": 204,
360
+ "[pH]": 205,
361
+ "[ClH+]": 206,
362
+ "[Ti+3]": 207,
363
+ "%23": 208,
364
+ "[Ru+]": 209,
365
+ "[SH-]": 210,
366
+ "[13CH]": 211,
367
+ "[IH+]": 212,
368
+ "[Hf+4]": 213,
369
+ "[Rf]": 214,
370
+ "[OH3+]": 215,
371
+ "%24": 216,
372
+ "[Pt+4]": 217,
373
+ "[Zr+3]": 218,
374
+ "[PH3+]": 219,
375
+ "[Sr+2]": 220,
376
+ "[Cd+2]": 221,
377
+ "[Cd]": 222,
378
+ "%25": 223,
379
+ "[Os]": 224,
380
+ "[BH-]": 225,
381
+ "[Sn+4]": 226,
382
+ "[Cr+3]": 227,
383
+ "[Ru+3]": 228,
384
+ "[PH2+]": 229,
385
+ "[Rh+2]": 230,
386
+ "[V+2]": 231,
387
+ "%26": 232,
388
+ "[Gd+3]": 233,
389
+ "[Pb+2]": 234,
390
+ "[PH]": 235,
391
+ "[Hg+]": 236,
392
+ "[Mo+2]": 237,
393
+ "[AlH]": 238,
394
+ "[Sn+]": 239,
395
+ "%27": 240,
396
+ "[Pd+]": 241,
397
+ "b": 242,
398
+ "[Rh+3]": 243,
399
+ "[Hg+2]": 244,
400
+ "[15NH]": 245,
401
+ "[14C]": 246,
402
+ "%28": 247,
403
+ "[Mn+3]": 248,
404
+ "[Si+]": 249,
405
+ "[SeH]": 250,
406
+ "[13C@H]": 251,
407
+ "[NH]": 252,
408
+ "[Ga+3]": 253,
409
+ "[SiH-]": 254,
410
+ "[13C@@H]": 255,
411
+ "[Ce]": 256,
412
+ "[Au+3]": 257,
413
+ "[Bi+3]": 258,
414
+ "[15N]": 259,
415
+ "%29": 260,
416
+ "[BH3-]": 261,
417
+ "[14cH]": 262,
418
+ "[Ti+]": 263,
419
+ "[Gd]": 264,
420
+ "[cH+]": 265,
421
+ "[Cr+2]": 266,
422
+ "[Sb-]": 267,
423
+ "%30": 268,
424
+ "[Be+2]": 269,
425
+ "[Al+]": 270,
426
+ "[te]": 271,
427
+ "[11CH3]": 272,
428
+ "[Sm]": 273,
429
+ "[Pr]": 274,
430
+ "[La]": 275,
431
+ "%31": 276,
432
+ "[Al-]": 277,
433
+ "[Ta]": 278,
434
+ "[125I]": 279,
435
+ "[BH2-]": 280,
436
+ "[Nb]": 281,
437
+ "[Si@]": 282,
438
+ "%32": 283,
439
+ "[14c]": 284,
440
+ "[Sb+3]": 285,
441
+ "[Ba]": 286,
442
+ "%33": 287,
443
+ "[Os+2]": 288,
444
+ "[Si@@]": 289,
445
+ "[La+3]": 290,
446
+ "[15n]": 291,
447
+ "[15NH2]": 292,
448
+ "[Nd+3]": 293,
449
+ "%34": 294,
450
+ "[14CH2]": 295,
451
+ "[18O]": 296,
452
+ "[Nd]": 297,
453
+ "[GeH]": 298,
454
+ "[Ni+3]": 299,
455
+ "[Eu]": 300,
456
+ "[Dy+3]": 301,
457
+ "[Sc]": 302,
458
+ "%36": 303,
459
+ "[Se-2]": 304,
460
+ "[As+]": 305,
461
+ "%35": 306,
462
+ "[AsH]": 307,
463
+ "[Tb]": 308,
464
+ "[Sb+5]": 309,
465
+ "[Se+]": 310,
466
+ "[Ce+3]": 311,
467
+ "[c+]": 312,
468
+ "[In+3]": 313,
469
+ "[SnH]": 314,
470
+ "[Mo+4]": 315,
471
+ "%37": 316,
472
+ "[V+4]": 317,
473
+ "[Eu+3]": 318,
474
+ "[Hf+2]": 319,
475
+ "%38": 320,
476
+ "[Pt+]": 321,
477
+ "[p+]": 322,
478
+ "[123I]": 323,
479
+ "[Tl+]": 324,
480
+ "[Sm+3]": 325,
481
+ "%39": 326,
482
+ "[Yb+3]": 327,
483
+ "%40": 328,
484
+ "[Yb]": 329,
485
+ "[Os+]": 330,
486
+ "%41": 331,
487
+ "[10B]": 332,
488
+ "[Sc+3]": 333,
489
+ "[Al+2]": 334,
490
+ "%42": 335,
491
+ "[Sr]": 336,
492
+ "[Tb+3]": 337,
493
+ "[Po]": 338,
494
+ "[Tc]": 339,
495
+ "[PH-]": 340,
496
+ "[AlH3]": 341,
497
+ "[Ar]": 342,
498
+ "[U+4]": 343,
499
+ "[SnH2]": 344,
500
+ "[Cl+2]": 345,
501
+ "[si]": 346,
502
+ "[Fe+]": 347,
503
+ "[14CH3]": 348,
504
+ "[U+3]": 349,
505
+ "[Cl+]": 350,
506
+ "%43": 351,
507
+ "[GeH2]": 352,
508
+ "%44": 353,
509
+ "[Er+3]": 354,
510
+ "[Mo+3]": 355,
511
+ "[I+2]": 356,
512
+ "[Fe+4]": 357,
513
+ "[99Tc]": 358,
514
+ "%45": 359,
515
+ "[11C]": 360,
516
+ "%46": 361,
517
+ "[SnH3]": 362,
518
+ "[S]": 363,
519
+ "[Te+]": 364,
520
+ "[Er]": 365,
521
+ "[Lu+3]": 366,
522
+ "[11B]": 367,
523
+ "%47": 368,
524
+ "%48": 369,
525
+ "[P]": 370,
526
+ "[Tm]": 371,
527
+ "[Th]": 372,
528
+ "[Dy]": 373,
529
+ "[Pr+3]": 374,
530
+ "[Ta+5]": 375,
531
+ "[Nb+5]": 376,
532
+ "[Rb]": 377,
533
+ "[GeH3]": 378,
534
+ "[Br+2]": 379,
535
+ "%49": 380,
536
+ "[131I]": 381,
537
+ "[Fm]": 382,
538
+ "[Cs]": 383,
539
+ "[BH4-]": 384,
540
+ "[Lu]": 385,
541
+ "[15nH]": 386,
542
+ "%50": 387,
543
+ "[Ru+6]": 388,
544
+ "[b-]": 389,
545
+ "[Ho]": 390,
546
+ "[Th+4]": 391,
547
+ "[Ru+4]": 392,
548
+ "%52": 393,
549
+ "[14CH]": 394,
550
+ "%51": 395,
551
+ "[Cr+6]": 396,
552
+ "[18OH]": 397,
553
+ "[Ho+3]": 398,
554
+ "[Ce+4]": 399,
555
+ "[Bi+2]": 400,
556
+ "[Co+]": 401,
557
+ "%53": 402,
558
+ "[Yb+2]": 403,
559
+ "[Fe+6]": 404,
560
+ "[Be]": 405,
561
+ "%54": 406,
562
+ "[SH3+]": 407,
563
+ "[Np]": 408,
564
+ "[As-]": 409,
565
+ "%55": 410,
566
+ "[14C@@H]": 411,
567
+ "[Ir+2]": 412,
568
+ "[GaH3]": 413,
569
+ "[p-]": 414,
570
+ "[GeH4]": 415,
571
+ "[Sn+3]": 416,
572
+ "[Os+4]": 417,
573
+ "%56": 418,
574
+ "[14C@H]": 419,
575
+ "[sH+]": 420,
576
+ "[19F]": 421,
577
+ "[Eu+2]": 422,
578
+ "[TlH]": 423,
579
+ "%57": 424,
580
+ "[Cr+4]": 425,
581
+ "%58": 426,
582
+ "[B@@-]": 427,
583
+ "[SiH+]": 428,
584
+ "[At]": 429,
585
+ "[Am]": 430,
586
+ "[Fe+5]": 431,
587
+ "[AsH2]": 432,
588
+ "[Si+4]": 433,
589
+ "[B@-]": 434,
590
+ "[Pu]": 435,
591
+ "[SbH]": 436,
592
+ "[P-2]": 437,
593
+ "[Tm+3]": 438,
594
+ "*": 439,
595
+ "%59": 440,
596
+ "[se+]": 441,
597
+ "%60": 442,
598
+ "[oH+]": 443,
599
+ "[1H]": 444,
600
+ "[15N+]": 445,
601
+ "[124I]": 446,
602
+ "[S@@+]": 447,
603
+ "[P-3]": 448,
604
+ "[H]": 449,
605
+ "[IH2+]": 450,
606
+ "[TeH]": 451,
607
+ "[Xe]": 452,
608
+ "[PH4+]": 453,
609
+ "[Cr+]": 454,
610
+ "[Cm]": 455,
611
+ "[I+3]": 456,
612
+ "%61": 457,
613
+ "[Nb+2]": 458,
614
+ "[Ru+5]": 459,
615
+ "%62": 460,
616
+ "[Ta+2]": 461,
617
+ "[Tc+4]": 462,
618
+ "[CH3+]": 463,
619
+ "[Pm]": 464,
620
+ "[Si@H]": 465,
621
+ "[No]": 466,
622
+ "%63": 467,
623
+ "[Cr+5]": 468,
624
+ "[Th+2]": 469,
625
+ "[Zn-2]": 470,
626
+ "[13C@]": 471,
627
+ "[Lr]": 472,
628
+ "%64": 473,
629
+ "[99Tc+3]": 474,
630
+ "%65": 475,
631
+ "[13C@@]": 476,
632
+ "%66": 477,
633
+ "[Fe-]": 478,
634
+ "[17O]": 479,
635
+ "[siH]": 480,
636
+ "[Sb+]": 481,
637
+ "[OH]": 482,
638
+ "[IH]": 483,
639
+ "[11CH2]": 484,
640
+ "[Cf]": 485,
641
+ "[SiH2+]": 486,
642
+ "[Gd+2]": 487,
643
+ "[In+]": 488,
644
+ "[Si@@H]": 489,
645
+ "[Mn+]": 490,
646
+ "[99Tc+4]": 491,
647
+ "[Ga-]": 492,
648
+ "%67": 493,
649
+ "[S@+]": 494,
650
+ "[Ge+4]": 495,
651
+ "[Tl+3]": 496,
652
+ "[16OH]": 497,
653
+ "%68": 498,
654
+ "[2H-]": 499,
655
+ "[Ra]": 500,
656
+ "[si-]": 501,
657
+ "[NiH2]": 502,
658
+ "[P@@H]": 503,
659
+ "[Rh+]": 504,
660
+ "[12C]": 505,
661
+ "[35S]": 506,
662
+ "[32P]": 507,
663
+ "[SiH2-]": 508,
664
+ "[AlH2+]": 509,
665
+ "[16O]": 510,
666
+ "%69": 511,
667
+ "[BiH]": 512,
668
+ "[BiH2]": 513,
669
+ "[Zn-]": 514,
670
+ "[BH]": 515,
671
+ "[Tc+3]": 516,
672
+ "[Ir+]": 517,
673
+ "[Ni+]": 518,
674
+ "%70": 519,
675
+ "[InH2]": 520,
676
+ "[InH]": 521,
677
+ "[Nb+3]": 522,
678
+ "[PbH]": 523,
679
+ "[Bi+]": 524,
680
+ "%71": 525,
681
+ "[As+3]": 526,
682
+ "%72": 527,
683
+ "[18O-]": 528,
684
+ "[68Ga+3]": 529,
685
+ "%73": 530,
686
+ "[Pa]": 531,
687
+ "[76Br]": 532,
688
+ "[Tc+5]": 533,
689
+ "[pH+]": 534,
690
+ "[64Cu+2]": 535,
691
+ "[Ru+8]": 536,
692
+ "%74": 537,
693
+ "[PH2-]": 538,
694
+ "[Si+2]": 539,
695
+ "[17OH]": 540,
696
+ "[RuH]": 541,
697
+ "[111In+3]": 542,
698
+ "[AlH+]": 543,
699
+ "%75": 544,
700
+ "%76": 545,
701
+ "[W+]": 546,
702
+ "[SbH2]": 547,
703
+ "[PoH]": 548,
704
+ "[Ru-]": 549,
705
+ "[XeH]": 550,
706
+ "[Tc+2]": 551,
707
+ "[13C-]": 552,
708
+ "[Br+]": 553,
709
+ "[Pt-2]": 554,
710
+ "[Es]": 555,
711
+ "[Cu-]": 556,
712
+ "[Mg+]": 557,
713
+ "[3HH]": 558,
714
+ "[P@H]": 559,
715
+ "[ClH2+]": 560,
716
+ "%77": 561,
717
+ "[SH]": 562,
718
+ "[Au-]": 563,
719
+ "[2HH]": 564,
720
+ "%78": 565,
721
+ "[Sn-]": 566,
722
+ "[11CH]": 567,
723
+ "[PdH2]": 568,
724
+ "0": 569,
725
+ "[Os+6]": 570,
726
+ "%79": 571,
727
+ "[Mo+]": 572,
728
+ "%80": 573,
729
+ "[al]": 574,
730
+ "[PbH2]": 575,
731
+ "[64Cu]": 576,
732
+ "[Cl]": 577,
733
+ "[12CH3]": 578,
734
+ "%81": 579,
735
+ "[Tc+7]": 580,
736
+ "[11c]": 581,
737
+ "%82": 582,
738
+ "[Li-]": 583,
739
+ "[99Tc+5]": 584,
740
+ "[He]": 585,
741
+ "[12c]": 586,
742
+ "[Kr]": 587,
743
+ "[RuH+2]": 588,
744
+ "[35Cl]": 589,
745
+ "[Pd-2]": 590,
746
+ "[GaH2]": 591,
747
+ "[4H]": 592,
748
+ "[Sg]": 593,
749
+ "[Cu-2]": 594,
750
+ "[Br+3]": 595,
751
+ "%83": 596,
752
+ "[37Cl]": 597,
753
+ "[211At]": 598,
754
+ "[IrH+2]": 599,
755
+ "[Mt]": 600,
756
+ "[Ir-2]": 601,
757
+ "[In-]": 602,
758
+ "[12cH]": 603,
759
+ "[12CH2]": 604,
760
+ "[RuH2]": 605,
761
+ "[99Tc+7]": 606,
762
+ "%84": 607,
763
+ "[15n+]": 608,
764
+ "[ClH2+2]": 609,
765
+ "[16N]": 610,
766
+ "[111In]": 611,
767
+ "[Tc+]": 612,
768
+ "[Ru-2]": 613,
769
+ "[12CH]": 614,
770
+ "[si+]": 615,
771
+ "[Tc+6]": 616,
772
+ "%85": 617,
773
+ "%86": 618,
774
+ "[90Y]": 619,
775
+ "[Pd-]": 620,
776
+ "[188Re]": 621,
777
+ "[RuH+]": 622,
778
+ "[NiH]": 623,
779
+ "[SiH3-]": 624,
780
+ "[14n]": 625,
781
+ "[CH3]": 626,
782
+ "[14N]": 627,
783
+ "[10BH2]": 628,
784
+ "%88": 629,
785
+ "%89": 630,
786
+ "%90": 631,
787
+ "[34S]": 632,
788
+ "[77Br]": 633,
789
+ "[GaH]": 634,
790
+ "[Br]": 635,
791
+ "[Ge@]": 636,
792
+ "[B@@H-]": 637,
793
+ "[CuH]": 638,
794
+ "[SiH4]": 639,
795
+ "[3H-]": 640,
796
+ "%87": 641,
797
+ "%91": 642,
798
+ "%92": 643,
799
+ "[67Cu]": 644,
800
+ "[I]": 645,
801
+ "[177Lu]": 646,
802
+ "[ReH]": 647,
803
+ "[67Ga+3]": 648,
804
+ "[Db]": 649,
805
+ "[177Lu+3]": 650,
806
+ "[AlH2-]": 651,
807
+ "[Si+3]": 652,
808
+ "[Ti-2]": 653,
809
+ "[RuH+3]": 654,
810
+ "[al+]": 655,
811
+ "[68Ga]": 656,
812
+ "[2H+]": 657,
813
+ "[B@H-]": 658,
814
+ "[WH2]": 659,
815
+ "[OsH]": 660,
816
+ "[Ir-3]": 661,
817
+ "[AlH-]": 662,
818
+ "[Bk]": 663,
819
+ "[75Se]": 664,
820
+ "[14C@]": 665,
821
+ "[Pt-]": 666,
822
+ "[N@@H+]": 667,
823
+ "[Nb-]": 668,
824
+ "[13NH2]": 669,
825
+ "%93": 670,
826
+ "[186Re]": 671,
827
+ "[Tb+4]": 672,
828
+ "[PtH]": 673,
829
+ "[IrH2]": 674,
830
+ "[Hg-2]": 675,
831
+ "[AlH3-]": 676,
832
+ "[PdH+]": 677,
833
+ "[Md]": 678,
834
+ "[RhH+2]": 679,
835
+ "[11cH]": 680,
836
+ "[Co-2]": 681,
837
+ "[15N-]": 682,
838
+ "[ZrH2]": 683,
839
+ "%94": 684,
840
+ "[Hg-]": 685,
841
+ "[127I]": 686,
842
+ "[AsH2+]": 687,
843
+ "[MoH2]": 688,
844
+ "[Te+4]": 689,
845
+ "[14C@@]": 690,
846
+ "[As+5]": 691,
847
+ "[SnH+3]": 692,
848
+ "[Ge@@]": 693,
849
+ "[6Li+]": 694,
850
+ "[WH]": 695,
851
+ "[Ne]": 696,
852
+ "[14NH2]": 697,
853
+ "[14NH]": 698,
854
+ "[12C@@H]": 699,
855
+ "[Os+7]": 700,
856
+ "[RhH]": 701,
857
+ "[Al-3]": 702,
858
+ "[SnH+]": 703,
859
+ "[15NH3+]": 704,
860
+ "[Zr+]": 705,
861
+ "[197Hg+]": 706,
862
+ "%95": 707,
863
+ "%96": 708,
864
+ "[90Y+3]": 709,
865
+ "[Os-2]": 710,
866
+ "[98Tc+5]": 711,
867
+ "[15NH3]": 712,
868
+ "[bH-]": 713,
869
+ "[33P]": 714,
870
+ "[Zr-2]": 715,
871
+ "[15O]": 716,
872
+ "[Rh-]": 717,
873
+ "[PbH3]": 718,
874
+ "[PH2]": 719,
875
+ "[Ni-]": 720,
876
+ "[CuH+]": 721,
877
+ "%97": 722,
878
+ "%98": 723,
879
+ "%99": 724,
880
+ "[Os+5]": 725,
881
+ "[PtH+]": 726,
882
+ "[ReH4]": 727,
883
+ "[16NH]": 728,
884
+ "[82Br]": 729,
885
+ "[W-]": 730,
886
+ "[18F-]": 731,
887
+ "[15NH4+]": 732,
888
+ "[Se+4]": 733,
889
+ "[SeH-]": 734,
890
+ "[67Cu+2]": 735,
891
+ "[12C@H]": 736,
892
+ "[AsH3]": 737,
893
+ "[HgH]": 738,
894
+ "[10B-]": 739,
895
+ "[99Tc+6]": 740,
896
+ "[117Sn+4]": 741,
897
+ "[Te@]": 742,
898
+ "[P@+]": 743,
899
+ "[35SH]": 744,
900
+ "[SeH+]": 745,
901
+ "[Ni-2]": 746,
902
+ "[Al-2]": 747,
903
+ "[TeH2]": 748,
904
+ "[Bh]": 749,
905
+ "[99Tc+2]": 750,
906
+ "[Os+8]": 751,
907
+ "[PH-2]": 752,
908
+ "[7Li+]": 753,
909
+ "[14nH]": 754,
910
+ "[AlH+2]": 755,
911
+ "[18FH]": 756,
912
+ "[SnH4]": 757,
913
+ "[18O-2]": 758,
914
+ "[IrH]": 759,
915
+ "[13N]": 760,
916
+ "[Te@@]": 761,
917
+ "[Rh-3]": 762,
918
+ "[15NH+]": 763,
919
+ "[AsH3+]": 764,
920
+ "[SeH2]": 765,
921
+ "[AsH+]": 766,
922
+ "[CoH2]": 767,
923
+ "[16NH2]": 768,
924
+ "[AsH-]": 769,
925
+ "[203Hg+]": 770,
926
+ "[P@@+]": 771,
927
+ "[166Ho+3]": 772,
928
+ "[60Co+3]": 773,
929
+ "[13CH2-]": 774,
930
+ "[SeH2+]": 775,
931
+ "[75Br]": 776,
932
+ "[TlH2]": 777,
933
+ "[80Br]": 778,
934
+ "[siH+]": 779,
935
+ "[Ca+]": 780,
936
+ "[153Sm+3]": 781,
937
+ "[PdH]": 782,
938
+ "[225Ac]": 783,
939
+ "[13CH3-]": 784,
940
+ "[AlH4-]": 785,
941
+ "[FeH]": 786,
942
+ "[13CH-]": 787,
943
+ "[14C-]": 788,
944
+ "[11C-]": 789,
945
+ "[153Sm]": 790,
946
+ "[Re-]": 791,
947
+ "[te+]": 792,
948
+ "[13CH4]": 793,
949
+ "[ClH+2]": 794,
950
+ "[8CH2]": 795,
951
+ "[99Mo]": 796,
952
+ "[ClH3+3]": 797,
953
+ "[SbH3]": 798,
954
+ "[25Mg+2]": 799,
955
+ "[16N+]": 800,
956
+ "[SnH2+]": 801,
957
+ "[11C@H]": 802,
958
+ "[122I]": 803,
959
+ "[Re-2]": 804,
960
+ "[RuH2+2]": 805,
961
+ "[ZrH]": 806,
962
+ "[Bi-]": 807,
963
+ "[Pr+]": 808,
964
+ "[Rn]": 809,
965
+ "[Fr]": 810,
966
+ "[36Cl]": 811,
967
+ "[18o]": 812,
968
+ "[YH]": 813,
969
+ "[79Br]": 814,
970
+ "[121I]": 815,
971
+ "[113In+3]": 816,
972
+ "[TaH]": 817,
973
+ "[RhH2]": 818,
974
+ "[Ta-]": 819,
975
+ "[67Ga]": 820,
976
+ "[ZnH+]": 821,
977
+ "[SnH2-]": 822,
978
+ "[OsH2]": 823,
979
+ "[16F]": 824,
980
+ "[FeH2]": 825,
981
+ "[14O]": 826,
982
+ "[PbH2+2]": 827,
983
+ "[BH2]": 828,
984
+ "[6H]": 829,
985
+ "[125Te]": 830,
986
+ "[197Hg]": 831,
987
+ "[TaH2]": 832,
988
+ "[TaH3]": 833,
989
+ "[76As]": 834,
990
+ "[Nb-2]": 835,
991
+ "[14N+]": 836,
992
+ "[125I-]": 837,
993
+ "[33S]": 838,
994
+ "[IH2+2]": 839,
995
+ "[NH2]": 840,
996
+ "[PtH2]": 841,
997
+ "[MnH]": 842,
998
+ "[19C]": 843,
999
+ "[17F]": 844,
1000
+ "[1H-]": 845,
1001
+ "[SnH4+2]": 846,
1002
+ "[Mn-2]": 847,
1003
+ "[15NH2+]": 848,
1004
+ "[TiH2]": 849,
1005
+ "[ReH7]": 850,
1006
+ "[Cd-2]": 851,
1007
+ "[Fe-3]": 852,
1008
+ "[SH2]": 853,
1009
+ "[17O-]": 854,
1010
+ "[siH-]": 855,
1011
+ "[CoH+]": 856,
1012
+ "[VH]": 857,
1013
+ "[10BH]": 858,
1014
+ "[Ru-3]": 859,
1015
+ "[13O]": 860,
1016
+ "[5H]": 861,
1017
+ "[15n-]": 862,
1018
+ "[153Gd]": 863,
1019
+ "[12C@]": 864,
1020
+ "[11CH3-]": 865,
1021
+ "[IrH3]": 866,
1022
+ "[RuH3]": 867,
1023
+ "[74Se]": 868,
1024
+ "[Se@]": 869,
1025
+ "[Hf+]": 870,
1026
+ "[77Se]": 871,
1027
+ "[166Ho]": 872,
1028
+ "[59Fe+2]": 873,
1029
+ "[203Hg]": 874,
1030
+ "[18OH-]": 875,
1031
+ "[8CH]": 876,
1032
+ "[12C@@]": 877,
1033
+ "[11CH4]": 878,
1034
+ "[15C]": 879,
1035
+ "[249Cf]": 880,
1036
+ "[PbH4]": 881,
1037
+ "[64Zn]": 882,
1038
+ "[99Tc+]": 883,
1039
+ "[14c-]": 884,
1040
+ "[149Pm]": 885,
1041
+ "[IrH4]": 886,
1042
+ "[Se@@]": 887,
1043
+ "[13OH]": 888,
1044
+ "[14CH3-]": 889,
1045
+ "[28Si]": 890,
1046
+ "[Rh-2]": 891,
1047
+ "[Fe-2]": 892,
1048
+ "[131I-]": 893,
1049
+ "[51Cr]": 894,
1050
+ "[62Cu+2]": 895,
1051
+ "[81Br]": 896,
1052
+ "[121Sb]": 897,
1053
+ "[7Li]": 898,
1054
+ "[89Zr+4]": 899,
1055
+ "[SbH3+]": 900,
1056
+ "[11C@@H]": 901,
1057
+ "[98Tc]": 902,
1058
+ "[59Fe+3]": 903,
1059
+ "[BiH2+]": 904,
1060
+ "[SbH+]": 905,
1061
+ "[TiH]": 906,
1062
+ "[14NH3]": 907,
1063
+ "[15OH]": 908,
1064
+ "[119Sn]": 909,
1065
+ "[201Hg]": 910,
1066
+ "[MnH+]": 911,
1067
+ "[201Tl]": 912,
1068
+ "[51Cr+3]": 913,
1069
+ "[123I-]": 914,
1070
+ "[MoH]": 915,
1071
+ "[AlH6-3]": 916,
1072
+ "[MnH2]": 917,
1073
+ "[WH3]": 918,
1074
+ "[213Bi+3]": 919,
1075
+ "[SnH2+2]": 920,
1076
+ "[123IH]": 921,
1077
+ "[13CH+]": 922,
1078
+ "[Zr-]": 923,
1079
+ "[74As]": 924,
1080
+ "[13C+]": 925,
1081
+ "[32P+]": 926,
1082
+ "[KrH]": 927,
1083
+ "[SiH+2]": 928,
1084
+ "[ClH3+2]": 929,
1085
+ "[13NH]": 930,
1086
+ "[9CH2]": 931,
1087
+ "[ZrH2+2]": 932,
1088
+ "[87Sr+2]": 933,
1089
+ "[35s]": 934,
1090
+ "[239Pu]": 935,
1091
+ "[198Au]": 936,
1092
+ "[241Am]": 937,
1093
+ "[203Hg+2]": 938,
1094
+ "[V+]": 939,
1095
+ "[YH2]": 940,
1096
+ "[195Pt]": 941,
1097
+ "[203Pb]": 942,
1098
+ "[RuH4]": 943,
1099
+ "[ThH2]": 944,
1100
+ "[AuH]": 945,
1101
+ "[66Ga+3]": 946,
1102
+ "[11B-]": 947,
1103
+ "[F]": 948,
1104
+ "[24Na+]": 949,
1105
+ "[85Sr+2]": 950,
1106
+ "[201Tl+]": 951,
1107
+ "[14CH4]": 952,
1108
+ "[32S]": 953,
1109
+ "[TeH2+]": 954,
1110
+ "[ClH2+3]": 955,
1111
+ "[AgH]": 956,
1112
+ "[Ge@H]": 957,
1113
+ "[44Ca+2]": 958,
1114
+ "[Os-]": 959,
1115
+ "[31P]": 960,
1116
+ "[15nH+]": 961,
1117
+ "[SbH4]": 962,
1118
+ "[TiH+]": 963,
1119
+ "[Ba+]": 964,
1120
+ "[57Co+2]": 965,
1121
+ "[Ta+]": 966,
1122
+ "[125IH]": 967,
1123
+ "[77As]": 968,
1124
+ "[129I]": 969,
1125
+ "[Fe-4]": 970,
1126
+ "[Ta-2]": 971,
1127
+ "[19O]": 972,
1128
+ "[12O]": 973,
1129
+ "[BiH3]": 974,
1130
+ "[237Np]": 975,
1131
+ "[252Cf]": 976,
1132
+ "[86Y]": 977,
1133
+ "[Cr-2]": 978,
1134
+ "[89Y]": 979,
1135
+ "[195Pt+2]": 980,
1136
+ "[si+2]": 981,
1137
+ "[58Fe+2]": 982,
1138
+ "[Hs]": 983,
1139
+ "[S@@H]": 984,
1140
+ "[8CH4]": 985,
1141
+ "[164Dy+3]": 986,
1142
+ "[47Ca+2]": 987,
1143
+ "[57Co]": 988,
1144
+ "[NbH2]": 989,
1145
+ "[ReH2]": 990,
1146
+ "[ZnH2]": 991,
1147
+ "[CrH2]": 992,
1148
+ "[17NH]": 993,
1149
+ "[ZrH3]": 994,
1150
+ "[RhH3]": 995,
1151
+ "[12C-]": 996,
1152
+ "[18O+]": 997,
1153
+ "[Bi-2]": 998,
1154
+ "[ClH4+3]": 999,
1155
+ "[Ni-3]": 1000,
1156
+ "[Ag-]": 1001,
1157
+ "[111In-]": 1002,
1158
+ "[Mo-2]": 1003,
1159
+ "[55Fe+3]": 1004,
1160
+ "[204Hg+]": 1005,
1161
+ "[35Cl-]": 1006,
1162
+ "[211Pb]": 1007,
1163
+ "[75Ge]": 1008,
1164
+ "[8B]": 1009,
1165
+ "[TeH3]": 1010,
1166
+ "[SnH3+]": 1011,
1167
+ "[Zr-3]": 1012,
1168
+ "[28F]": 1013,
1169
+ "[249Bk]": 1014,
1170
+ "[169Yb]": 1015,
1171
+ "[34SH]": 1016,
1172
+ "[6Li]": 1017,
1173
+ "[94Tc]": 1018,
1174
+ "[197Au]": 1019,
1175
+ "[195Pt+4]": 1020,
1176
+ "[169Yb+3]": 1021,
1177
+ "[32Cl]": 1022,
1178
+ "[82Se]": 1023,
1179
+ "[159Gd+3]": 1024,
1180
+ "[213Bi]": 1025,
1181
+ "[CoH+2]": 1026,
1182
+ "[36S]": 1027,
1183
+ "[35P]": 1028,
1184
+ "[Ru-4]": 1029,
1185
+ "[Cr-3]": 1030,
1186
+ "[60Co]": 1031,
1187
+ "[1H+]": 1032,
1188
+ "[18CH2]": 1033,
1189
+ "[Cd-]": 1034,
1190
+ "[152Sm+3]": 1035,
1191
+ "[106Ru]": 1036,
1192
+ "[238Pu]": 1037,
1193
+ "[220Rn]": 1038,
1194
+ "[45Ca+2]": 1039,
1195
+ "[89Sr+2]": 1040,
1196
+ "[239Np]": 1041,
1197
+ "[90Sr+2]": 1042,
1198
+ "[137Cs+]": 1043,
1199
+ "[165Dy]": 1044,
1200
+ "[68GaH3]": 1045,
1201
+ "[65Zn+2]": 1046,
1202
+ "[89Zr]": 1047,
1203
+ "[BiH2+2]": 1048,
1204
+ "[62Cu]": 1049,
1205
+ "[165Dy+3]": 1050,
1206
+ "[238U]": 1051,
1207
+ "[105Rh+3]": 1052,
1208
+ "[70Zn]": 1053,
1209
+ "[12B]": 1054,
1210
+ "[12OH]": 1055,
1211
+ "[18CH]": 1056,
1212
+ "[17CH]": 1057,
1213
+ "[42K]": 1058,
1214
+ "[76Br-]": 1059,
1215
+ "[71As]": 1060,
1216
+ "[NbH3]": 1061,
1217
+ "[ReH3]": 1062,
1218
+ "[OsH-]": 1063,
1219
+ "[WH4]": 1064,
1220
+ "[MoH3]": 1065,
1221
+ "[OsH4]": 1066,
1222
+ "[RuH6]": 1067,
1223
+ "[PtH3]": 1068,
1224
+ "[CuH2]": 1069,
1225
+ "[CoH3]": 1070,
1226
+ "[TiH4]": 1071,
1227
+ "[64Zn+2]": 1072,
1228
+ "[Si-2]": 1073,
1229
+ "[79BrH]": 1074,
1230
+ "[14CH2-]": 1075,
1231
+ "[PtH2+2]": 1076,
1232
+ "[Os-3]": 1077,
1233
+ "[29Si]": 1078,
1234
+ "[Ti-]": 1079,
1235
+ "[Se+6]": 1080,
1236
+ "[22Na+]": 1081,
1237
+ "[42K+]": 1082,
1238
+ "[131Cs+]": 1083,
1239
+ "[86Rb+]": 1084,
1240
+ "[134Cs+]": 1085,
1241
+ "[209Po]": 1086,
1242
+ "[208Po]": 1087,
1243
+ "[81Rb+]": 1088,
1244
+ "[203Tl+]": 1089,
1245
+ "[Zr-4]": 1090,
1246
+ "[148Sm]": 1091,
1247
+ "[147Sm]": 1092,
1248
+ "[37Cl-]": 1093,
1249
+ "[12CH4]": 1094,
1250
+ "[Ge@@H]": 1095,
1251
+ "[63Cu]": 1096,
1252
+ "[13CH2+]": 1097,
1253
+ "[AsH2-]": 1098,
1254
+ "[CeH]": 1099,
1255
+ "[SnH-]": 1100,
1256
+ "[UH]": 1101,
1257
+ "[9c]": 1102,
1258
+ "[21CH3]": 1103,
1259
+ "[TeH+]": 1104,
1260
+ "[57Co+3]": 1105,
1261
+ "[8BH2]": 1106,
1262
+ "[12BH2]": 1107,
1263
+ "[19BH2]": 1108,
1264
+ "[9BH2]": 1109,
1265
+ "[YbH2]": 1110,
1266
+ "[CrH+2]": 1111,
1267
+ "[208Bi]": 1112,
1268
+ "[152Gd]": 1113,
1269
+ "[61Cu]": 1114,
1270
+ "[115In]": 1115,
1271
+ "[60Co+2]": 1116,
1272
+ "[13NH2-]": 1117,
1273
+ "[120I]": 1118,
1274
+ "[18OH2]": 1119,
1275
+ "[75SeH]": 1120,
1276
+ "[SbH2+]": 1121,
1277
+ "[144Ce]": 1122,
1278
+ "[16n]": 1123,
1279
+ "[113In]": 1124,
1280
+ "[22nH]": 1125,
1281
+ "[129I-]": 1126,
1282
+ "[InH3]": 1127,
1283
+ "[32PH3]": 1128,
1284
+ "[234U]": 1129,
1285
+ "[235U]": 1130,
1286
+ "[59Fe]": 1131,
1287
+ "[82Rb+]": 1132,
1288
+ "[65Zn]": 1133,
1289
+ "[244Cm]": 1134,
1290
+ "[147Pm]": 1135,
1291
+ "[91Y]": 1136,
1292
+ "[237Pu]": 1137,
1293
+ "[231Pa]": 1138,
1294
+ "[253Cf]": 1139,
1295
+ "[127Te]": 1140,
1296
+ "[187Re]": 1141,
1297
+ "[236Np]": 1142,
1298
+ "[235Np]": 1143,
1299
+ "[72Zn]": 1144,
1300
+ "[253Es]": 1145,
1301
+ "[159Dy]": 1146,
1302
+ "[62Zn]": 1147,
1303
+ "[101Tc]": 1148,
1304
+ "[149Tb]": 1149,
1305
+ "[124I-]": 1150,
1306
+ "[SeH3+]": 1151,
1307
+ "[210Pb]": 1152,
1308
+ "[40K]": 1153,
1309
+ "[210Po]": 1154,
1310
+ "[214Pb]": 1155,
1311
+ "[218Po]": 1156,
1312
+ "[214Po]": 1157,
1313
+ "[7Be]": 1158,
1314
+ "[212Pb]": 1159,
1315
+ "[205Pb]": 1160,
1316
+ "[209Pb]": 1161,
1317
+ "[123Te]": 1162,
1318
+ "[202Pb]": 1163,
1319
+ "[72As]": 1164,
1320
+ "[201Pb]": 1165,
1321
+ "[70As]": 1166,
1322
+ "[73Ge]": 1167,
1323
+ "[200Pb]": 1168,
1324
+ "[198Pb]": 1169,
1325
+ "[66Ga]": 1170,
1326
+ "[73Se]": 1171,
1327
+ "[195Pb]": 1172,
1328
+ "[199Pb]": 1173,
1329
+ "[144Ce+3]": 1174,
1330
+ "[235U+2]": 1175,
1331
+ "[90Tc]": 1176,
1332
+ "[114In+3]": 1177,
1333
+ "[128I]": 1178,
1334
+ "[100Tc+]": 1179,
1335
+ "[82Br-]": 1180,
1336
+ "[191Pt+2]": 1181,
1337
+ "[191Pt+4]": 1182,
1338
+ "[193Pt+4]": 1183,
1339
+ "[31PH3]": 1184,
1340
+ "[125I+2]": 1185,
1341
+ "[131I+2]": 1186,
1342
+ "[125Te+4]": 1187,
1343
+ "[82Sr+2]": 1188,
1344
+ "[149Sm]": 1189,
1345
+ "[81BrH]": 1190,
1346
+ "[129Xe]": 1191,
1347
+ "[193Pt+2]": 1192,
1348
+ "[123I+2]": 1193,
1349
+ "[Cr-]": 1194,
1350
+ "[Co-]": 1195,
1351
+ "[227Th+4]": 1196,
1352
+ "[249Cf+3]": 1197,
1353
+ "[252Cf+3]": 1198,
1354
+ "[187Os]": 1199,
1355
+ "[16O-]": 1200,
1356
+ "[17O+]": 1201,
1357
+ "[16OH-]": 1202,
1358
+ "[98Tc+7]": 1203,
1359
+ "[58Co+2]": 1204,
1360
+ "[69Ga+3]": 1205,
1361
+ "[57Fe+2]": 1206,
1362
+ "[43K+]": 1207,
1363
+ "[16C]": 1208,
1364
+ "[52Fe+3]": 1209,
1365
+ "[SeH5]": 1210,
1366
+ "[194Pb]": 1211,
1367
+ "[196Pb]": 1212,
1368
+ "[197Pb]": 1213,
1369
+ "[213Pb]": 1214,
1370
+ "[9B]": 1215,
1371
+ "[19B]": 1216,
1372
+ "[11CH-]": 1217,
1373
+ "[9CH]": 1218,
1374
+ "[20OH]": 1219,
1375
+ "[25OH]": 1220,
1376
+ "[8cH]": 1221,
1377
+ "[TiH+3]": 1222,
1378
+ "[SnH6+3]": 1223,
1379
+ "[N@H+]": 1224,
1380
+ "[52Mn+2]": 1225,
1381
+ "[64Ga]": 1226,
1382
+ "[13B]": 1227,
1383
+ "[216Bi]": 1228,
1384
+ "[117Sn+2]": 1229,
1385
+ "[232Th]": 1230,
1386
+ "[SnH+2]": 1231,
1387
+ "[BiH5]": 1232,
1388
+ "[77Kr]": 1233,
1389
+ "[103Cd]": 1234,
1390
+ "[62Ni]": 1235,
1391
+ "[LaH3]": 1236,
1392
+ "[SmH3]": 1237,
1393
+ "[EuH3]": 1238,
1394
+ "[MoH5]": 1239,
1395
+ "[64Ni]": 1240,
1396
+ "[66Zn]": 1241,
1397
+ "[68Zn]": 1242,
1398
+ "[186W]": 1243,
1399
+ "[FeH4]": 1244,
1400
+ "[MoH4]": 1245,
1401
+ "[HgH2]": 1246,
1402
+ "[15NH2-]": 1247,
1403
+ "[UH2]": 1248,
1404
+ "[204Hg]": 1249,
1405
+ "[GaH4-]": 1250,
1406
+ "[ThH4]": 1251,
1407
+ "[WH6]": 1252,
1408
+ "[PtH4]": 1253,
1409
+ "[VH2]": 1254,
1410
+ "[UH3]": 1255,
1411
+ "[FeH3]": 1256,
1412
+ "[RuH5]": 1257,
1413
+ "[BiH4]": 1258,
1414
+ "[80Br-]": 1259,
1415
+ "[CeH3]": 1260,
1416
+ "[37ClH]": 1261,
1417
+ "[157Gd+3]": 1262,
1418
+ "[205Tl]": 1263,
1419
+ "[203Tl]": 1264,
1420
+ "[62Cu+]": 1265,
1421
+ "[64Cu+]": 1266,
1422
+ "[61Cu+]": 1267,
1423
+ "[37SH2]": 1268,
1424
+ "[30Si]": 1269,
1425
+ "[28Al]": 1270,
1426
+ "[19OH2]": 1271,
1427
+ "[8He]": 1272,
1428
+ "[6He]": 1273,
1429
+ "[153Pm]": 1274,
1430
+ "[209Bi]": 1275,
1431
+ "[66Zn+2]": 1276,
1432
+ "[10CH4]": 1277,
1433
+ "[191Ir]": 1278,
1434
+ "[66Cu]": 1279,
1435
+ "[16O+]": 1280,
1436
+ "[25O]": 1281,
1437
+ "[10c]": 1282,
1438
+ "[Co-3]": 1283,
1439
+ "[Sn@@]": 1284,
1440
+ "[17OH-]": 1285,
1441
+ "[206Po]": 1286,
1442
+ "[204Po]": 1287,
1443
+ "[202Po]": 1288,
1444
+ "[201Po]": 1289,
1445
+ "[200Po]": 1290,
1446
+ "[199Po]": 1291,
1447
+ "[198Po]": 1292,
1448
+ "[197Po]": 1293,
1449
+ "[196Po]": 1294,
1450
+ "[195Po]": 1295,
1451
+ "[194Po]": 1296,
1452
+ "[193Po]": 1297,
1453
+ "[192Po]": 1298,
1454
+ "[191Po]": 1299,
1455
+ "[190Po]": 1300,
1456
+ "[217Po]": 1301,
1457
+ "[BiH4-]": 1302,
1458
+ "[TeH4]": 1303,
1459
+ "[222Ra]": 1304,
1460
+ "[62Ga]": 1305,
1461
+ "[39Ar]": 1306,
1462
+ "[144Sm]": 1307,
1463
+ "[58Fe]": 1308,
1464
+ "[153Eu]": 1309,
1465
+ "[85Rb]": 1310,
1466
+ "[171Yb]": 1311,
1467
+ "[172Yb]": 1312,
1468
+ "[114Cd]": 1313,
1469
+ "[51Fe]": 1314,
1470
+ "[142Ce]": 1315,
1471
+ "[207Tl]": 1316,
1472
+ "[92Mo]": 1317,
1473
+ "[115Sn]": 1318,
1474
+ "[140Ce]": 1319,
1475
+ "[202Hg]": 1320,
1476
+ "[180W]": 1321,
1477
+ "[182W]": 1322,
1478
+ "[183W]": 1323,
1479
+ "[184W]": 1324,
1480
+ "[96Mo]": 1325,
1481
+ "[47Ti]": 1326,
1482
+ "[111Cd]": 1327,
1483
+ "[143Nd]": 1328,
1484
+ "[145Nd]": 1329,
1485
+ "[126Te]": 1330,
1486
+ "[128Te]": 1331,
1487
+ "[130Te]": 1332,
1488
+ "[185Re]": 1333,
1489
+ "[97Mo]": 1334,
1490
+ "[98Mo]": 1335,
1491
+ "[183Re]": 1336,
1492
+ "[52V]": 1337,
1493
+ "[80Se]": 1338,
1494
+ "[87Kr]": 1339,
1495
+ "[137Xe]": 1340,
1496
+ "[196Au]": 1341,
1497
+ "[146Ce]": 1342,
1498
+ "[88Kr]": 1343,
1499
+ "[51Ti]": 1344,
1500
+ "[138Xe]": 1345,
1501
+ "[112Cd]": 1346,
1502
+ "[116Sn]": 1347,
1503
+ "[120Sn]": 1348,
1504
+ "[28SiH3]": 1349,
1505
+ "[35S-]": 1350,
1506
+ "[15NH-]": 1351,
1507
+ "[13CH3+]": 1352,
1508
+ "[34S+]": 1353,
1509
+ "[34s]": 1354,
1510
+ "[SiH4-]": 1355,
1511
+ "[100Tc+5]": 1356,
1512
+ "[NiH2+2]": 1357,
1513
+ "[239Th]": 1358,
1514
+ "[186Lu]": 1359,
1515
+ "[AuH3]": 1360,
1516
+ "[I@@-]": 1361,
1517
+ "[XeH2]": 1362,
1518
+ "[B+]": 1363,
1519
+ "[16CH2]": 1364,
1520
+ "[8C]": 1365,
1521
+ "[TaH5]": 1366,
1522
+ "[FeH4-]": 1367,
1523
+ "[19C@H]": 1368,
1524
+ "[10NH]": 1369,
1525
+ "[FeH6-3]": 1370,
1526
+ "[22CH]": 1371,
1527
+ "[25N]": 1372,
1528
+ "[25N+]": 1373,
1529
+ "[25N-]": 1374,
1530
+ "[21CH2]": 1375,
1531
+ "[18cH]": 1376,
1532
+ "[113I]": 1377,
1533
+ "[ScH3]": 1378,
1534
+ "[30PH3]": 1379,
1535
+ "[43Ca+2]": 1380,
1536
+ "[41Ca+2]": 1381,
1537
+ "[106Cd]": 1382,
1538
+ "[122Sn]": 1383,
1539
+ "[18CH3]": 1384,
1540
+ "[58Co+3]": 1385,
1541
+ "[98Tc+4]": 1386,
1542
+ "[70Ge]": 1387,
1543
+ "[76Ge]": 1388,
1544
+ "[108Cd]": 1389,
1545
+ "[116Cd]": 1390,
1546
+ "[130Xe]": 1391,
1547
+ "[94Mo]": 1392,
1548
+ "[124Sn]": 1393,
1549
+ "[186Os]": 1394,
1550
+ "[188Os]": 1395,
1551
+ "[190Os]": 1396,
1552
+ "[192Os]": 1397,
1553
+ "[106Pd]": 1398,
1554
+ "[110Pd]": 1399,
1555
+ "[120Te]": 1400,
1556
+ "[132Ba]": 1401,
1557
+ "[134Ba]": 1402,
1558
+ "[136Ba]": 1403,
1559
+ "[136Ce]": 1404,
1560
+ "[138Ce]": 1405,
1561
+ "[156Dy]": 1406,
1562
+ "[158Dy]": 1407,
1563
+ "[160Dy]": 1408,
1564
+ "[163Dy]": 1409,
1565
+ "[162Er]": 1410,
1566
+ "[164Er]": 1411,
1567
+ "[167Er]": 1412,
1568
+ "[176Hf]": 1413,
1569
+ "[26Mg]": 1414,
1570
+ "[144Nd]": 1415,
1571
+ "[150Nd]": 1416,
1572
+ "[41K]": 1417,
1573
+ "[46Ti]": 1418,
1574
+ "[48Ti]": 1419,
1575
+ "[49Ti]": 1420,
1576
+ "[50Ti]": 1421,
1577
+ "[170Yb]": 1422,
1578
+ "[173Yb]": 1423,
1579
+ "[91Zr]": 1424,
1580
+ "[92Zr]": 1425,
1581
+ "[96Zr]": 1426,
1582
+ "[34S-]": 1427,
1583
+ "[CuH2-]": 1428,
1584
+ "[38Cl]": 1429,
1585
+ "[25Mg]": 1430,
1586
+ "[51V]": 1431,
1587
+ "[93Nb]": 1432,
1588
+ "[95Mo]": 1433,
1589
+ "[45Sc]": 1434,
1590
+ "[123Sb]": 1435,
1591
+ "[139La]": 1436,
1592
+ "[9Be]": 1437,
1593
+ "[99Y+3]": 1438,
1594
+ "[99Y]": 1439,
1595
+ "[156Ho]": 1440,
1596
+ "[67Zn]": 1441,
1597
+ "[144Ce+4]": 1442,
1598
+ "[210Tl]": 1443,
1599
+ "[42Ca]": 1444,
1600
+ "[54Fe]": 1445,
1601
+ "[193Ir]": 1446,
1602
+ "[92Nb]": 1447,
1603
+ "[141Cs]": 1448,
1604
+ "[52Cr]": 1449,
1605
+ "[35ClH]": 1450,
1606
+ "[46Ca]": 1451,
1607
+ "[139Cs]": 1452,
1608
+ "[65Cu]": 1453,
1609
+ "[71Ga]": 1454,
1610
+ "[60Ni]": 1455,
1611
+ "[16NH3]": 1456,
1612
+ "[148Nd]": 1457,
1613
+ "[72Ge]": 1458,
1614
+ "[161Dy]": 1459,
1615
+ "[49Ca]": 1460,
1616
+ "[43Ca]": 1461,
1617
+ "[8Be]": 1462,
1618
+ "[48Ca]": 1463,
1619
+ "[44Ca]": 1464,
1620
+ "[120Xe]": 1465,
1621
+ "[80Rb]": 1466,
1622
+ "[215At]": 1467,
1623
+ "[180Re]": 1468,
1624
+ "[146Sm]": 1469,
1625
+ "[19Ne]": 1470,
1626
+ "[74Kr]": 1471,
1627
+ "[134La]": 1472,
1628
+ "[76Kr]": 1473,
1629
+ "[219Fr]": 1474,
1630
+ "[121Xe]": 1475,
1631
+ "[220Fr]": 1476,
1632
+ "[216At]": 1477,
1633
+ "[223Ac]": 1478,
1634
+ "[218At]": 1479,
1635
+ "[37Ar]": 1480,
1636
+ "[135I]": 1481,
1637
+ "[110Cd]": 1482,
1638
+ "[94Tc+7]": 1483,
1639
+ "[86Y+3]": 1484,
1640
+ "[135I-]": 1485,
1641
+ "[15O-2]": 1486,
1642
+ "[151Eu+3]": 1487,
1643
+ "[161Tb+3]": 1488,
1644
+ "[197Hg+2]": 1489,
1645
+ "[109Cd+2]": 1490,
1646
+ "[191Os+4]": 1491,
1647
+ "[170Tm+3]": 1492,
1648
+ "[205Bi+3]": 1493,
1649
+ "[233U+4]": 1494,
1650
+ "[126Sb+3]": 1495,
1651
+ "[127Sb+3]": 1496,
1652
+ "[132Cs+]": 1497,
1653
+ "[136Eu+3]": 1498,
1654
+ "[136Eu]": 1499,
1655
+ "[125Sn+4]": 1500,
1656
+ "[175Yb+3]": 1501,
1657
+ "[100Mo]": 1502,
1658
+ "[22Ne]": 1503,
1659
+ "[13c-]": 1504,
1660
+ "[13NH4+]": 1505,
1661
+ "[17C]": 1506,
1662
+ "[9C]": 1507,
1663
+ "[31S]": 1508,
1664
+ "[31SH]": 1509,
1665
+ "[133I]": 1510,
1666
+ "[126I]": 1511,
1667
+ "[36SH]": 1512,
1668
+ "[30S]": 1513,
1669
+ "[32SH]": 1514,
1670
+ "[19CH2]": 1515,
1671
+ "[19c]": 1516,
1672
+ "[18c]": 1517,
1673
+ "[15F]": 1518,
1674
+ "[10C]": 1519,
1675
+ "[RuH-]": 1520,
1676
+ "[62Zn+2]": 1521,
1677
+ "[32ClH]": 1522,
1678
+ "[33ClH]": 1523,
1679
+ "[78BrH]": 1524,
1680
+ "[12Li+]": 1525,
1681
+ "[12Li]": 1526,
1682
+ "[233Ra]": 1527,
1683
+ "[68Ge+4]": 1528,
1684
+ "[44Sc+3]": 1529,
1685
+ "[91Y+3]": 1530,
1686
+ "[106Ru+3]": 1531,
1687
+ "[PoH2]": 1532,
1688
+ "[AtH]": 1533,
1689
+ "[55Fe]": 1534,
1690
+ "[233U]": 1535,
1691
+ "[210PoH2]": 1536,
1692
+ "[230Th]": 1537,
1693
+ "[228Th]": 1538,
1694
+ "[222Rn]": 1539,
1695
+ "[35SH2]": 1540,
1696
+ "[227Th]": 1541,
1697
+ "[192Ir]": 1542,
1698
+ "[133Xe]": 1543,
1699
+ "[81Kr]": 1544,
1700
+ "[95Zr]": 1545,
1701
+ "[240Pu]": 1546,
1702
+ "[54Mn]": 1547,
1703
+ "[103Ru]": 1548,
1704
+ "[95Nb]": 1549,
1705
+ "[109Cd]": 1550,
1706
+ "[141Ce]": 1551,
1707
+ "[85Kr]": 1552,
1708
+ "[110Ag]": 1553,
1709
+ "[58Co]": 1554,
1710
+ "[241Pu]": 1555,
1711
+ "[234Th]": 1556,
1712
+ "[140La]": 1557,
1713
+ "[63Ni]": 1558,
1714
+ "[152Eu]": 1559,
1715
+ "[132IH]": 1560,
1716
+ "[226Rn]": 1561,
1717
+ "[154Eu]": 1562,
1718
+ "[36ClH]": 1563,
1719
+ "[228Ac]": 1564,
1720
+ "[155Eu]": 1565,
1721
+ "[106Rh]": 1566,
1722
+ "[243Am]": 1567,
1723
+ "[227Ac]": 1568,
1724
+ "[243Cm]": 1569,
1725
+ "[236U]": 1570,
1726
+ "[144Pr]": 1571,
1727
+ "[232U]": 1572,
1728
+ "[32SH2]": 1573,
1729
+ "[88Y]": 1574,
1730
+ "[82BrH]": 1575,
1731
+ "[135IH]": 1576,
1732
+ "[242Cm]": 1577,
1733
+ "[115Cd]": 1578,
1734
+ "[242Pu]": 1579,
1735
+ "[46Sc]": 1580,
1736
+ "[56Mn]": 1581,
1737
+ "[234Pa]": 1582,
1738
+ "[41Ar]": 1583,
1739
+ "[147Nd]": 1584,
1740
+ "[187W]": 1585,
1741
+ "[151Sm]": 1586,
1742
+ "[59Ni]": 1587,
1743
+ "[233Pa]": 1588,
1744
+ "[52Mn]": 1589,
1745
+ "[94Nb]": 1590,
1746
+ "[219Rn]": 1591,
1747
+ "[236Pu]": 1592,
1748
+ "[13NH3]": 1593,
1749
+ "[93Zr]": 1594,
1750
+ "[51Cr+6]": 1595,
1751
+ "[TlH3]": 1596,
1752
+ "[123Xe]": 1597,
1753
+ "[160Tb]": 1598,
1754
+ "[170Tm]": 1599,
1755
+ "[182Ta]": 1600,
1756
+ "[175Yb]": 1601,
1757
+ "[93Mo]": 1602,
1758
+ "[143Ce]": 1603,
1759
+ "[191Os]": 1604,
1760
+ "[126IH]": 1605,
1761
+ "[48V]": 1606,
1762
+ "[113Cd]": 1607,
1763
+ "[47Sc]": 1608,
1764
+ "[181Hf]": 1609,
1765
+ "[185W]": 1610,
1766
+ "[143Pr]": 1611,
1767
+ "[191Pt]": 1612,
1768
+ "[181W]": 1613,
1769
+ "[33PH3]": 1614,
1770
+ "[97Ru]": 1615,
1771
+ "[97Tc]": 1616,
1772
+ "[111Ag]": 1617,
1773
+ "[169Er]": 1618,
1774
+ "[107Pd]": 1619,
1775
+ "[103Ru+2]": 1620,
1776
+ "[34SH2]": 1621,
1777
+ "[137Ce]": 1622,
1778
+ "[242Am]": 1623,
1779
+ "[117SnH2]": 1624,
1780
+ "[57Ni]": 1625,
1781
+ "[239U]": 1626,
1782
+ "[60Cu]": 1627,
1783
+ "[250Cf]": 1628,
1784
+ "[193Au]": 1629,
1785
+ "[69Zn]": 1630,
1786
+ "[55Co]": 1631,
1787
+ "[139Ce]": 1632,
1788
+ "[127Xe]": 1633,
1789
+ "[159Gd]": 1634,
1790
+ "[56Co]": 1635,
1791
+ "[177Hf]": 1636,
1792
+ "[244Pu]": 1637,
1793
+ "[38ClH]": 1638,
1794
+ "[142Pr]": 1639,
1795
+ "[199Hg]": 1640,
1796
+ "[179Hf]": 1641,
1797
+ "[178Hf]": 1642,
1798
+ "[237U]": 1643,
1799
+ "[156Eu]": 1644,
1800
+ "[157Eu]": 1645,
1801
+ "[105Ru]": 1646,
1802
+ "[171Tm]": 1647,
1803
+ "[199Au]": 1648,
1804
+ "[155Sm]": 1649,
1805
+ "[80BrH]": 1650,
1806
+ "[108Ag]": 1651,
1807
+ "[128IH]": 1652,
1808
+ "[48Sc]": 1653,
1809
+ "[45Ti]": 1654,
1810
+ "[176Lu]": 1655,
1811
+ "[121SnH2]": 1656,
1812
+ "[148Pm]": 1657,
1813
+ "[57Fe]": 1658,
1814
+ "[10BH3]": 1659,
1815
+ "[96Tc]": 1660,
1816
+ "[133IH]": 1661,
1817
+ "[143Pm]": 1662,
1818
+ "[105Rh]": 1663,
1819
+ "[130IH]": 1664,
1820
+ "[134IH]": 1665,
1821
+ "[131IH]": 1666,
1822
+ "[71Zn]": 1667,
1823
+ "[105Ag]": 1668,
1824
+ "[97Zr]": 1669,
1825
+ "[235Pu]": 1670,
1826
+ "[231Th]": 1671,
1827
+ "[109Pd]": 1672,
1828
+ "[93Y]": 1673,
1829
+ "[190Ir]": 1674,
1830
+ "[135Xe]": 1675,
1831
+ "[53Mn]": 1676,
1832
+ "[134Ce]": 1677,
1833
+ "[234Np]": 1678,
1834
+ "[240Am]": 1679,
1835
+ "[246Cf]": 1680,
1836
+ "[240Cm]": 1681,
1837
+ "[241Cm]": 1682,
1838
+ "[226Th]": 1683,
1839
+ "[39ClH]": 1684,
1840
+ "[229Th]": 1685,
1841
+ "[245Cm]": 1686,
1842
+ "[240U]": 1687,
1843
+ "[240Np]": 1688,
1844
+ "[249Cm]": 1689,
1845
+ "[243Pu]": 1690,
1846
+ "[145Pm]": 1691,
1847
+ "[199Pt]": 1692,
1848
+ "[246Bk]": 1693,
1849
+ "[193Pt]": 1694,
1850
+ "[230U]": 1695,
1851
+ "[250Cm]": 1696,
1852
+ "[44Ti]": 1697,
1853
+ "[175Hf]": 1698,
1854
+ "[254Fm]": 1699,
1855
+ "[255Fm]": 1700,
1856
+ "[257Fm]": 1701,
1857
+ "[92Y]": 1702,
1858
+ "[188Ir]": 1703,
1859
+ "[171Lu]": 1704,
1860
+ "[257Md]": 1705,
1861
+ "[247Bk]": 1706,
1862
+ "[121IH]": 1707,
1863
+ "[250Bk]": 1708,
1864
+ "[179Lu]": 1709,
1865
+ "[224Ac]": 1710,
1866
+ "[195Hg]": 1711,
1867
+ "[244Am]": 1712,
1868
+ "[246Pu]": 1713,
1869
+ "[194Au]": 1714,
1870
+ "[252Fm]": 1715,
1871
+ "[173Hf]": 1716,
1872
+ "[246Cm]": 1717,
1873
+ "[135Ce]": 1718,
1874
+ "[49Cr]": 1719,
1875
+ "[248Cf]": 1720,
1876
+ "[247Cm]": 1721,
1877
+ "[248Cm]": 1722,
1878
+ "[174Ta]": 1723,
1879
+ "[176Ta]": 1724,
1880
+ "[154Tb]": 1725,
1881
+ "[172Ta]": 1726,
1882
+ "[177Ta]": 1727,
1883
+ "[175Ta]": 1728,
1884
+ "[180Ta]": 1729,
1885
+ "[158Tb]": 1730,
1886
+ "[115Ag]": 1731,
1887
+ "[189Os]": 1732,
1888
+ "[251Cf]": 1733,
1889
+ "[145Pr]": 1734,
1890
+ "[147Pr]": 1735,
1891
+ "[76BrH]": 1736,
1892
+ "[102Rh]": 1737,
1893
+ "[238Np]": 1738,
1894
+ "[185Os]": 1739,
1895
+ "[246Am]": 1740,
1896
+ "[233Np]": 1741,
1897
+ "[166Dy]": 1742,
1898
+ "[254Es]": 1743,
1899
+ "[244Cf]": 1744,
1900
+ "[193Os]": 1745,
1901
+ "[245Am]": 1746,
1902
+ "[245Bk]": 1747,
1903
+ "[239Am]": 1748,
1904
+ "[238Am]": 1749,
1905
+ "[97Nb]": 1750,
1906
+ "[245Pu]": 1751,
1907
+ "[254Cf]": 1752,
1908
+ "[188W]": 1753,
1909
+ "[250Es]": 1754,
1910
+ "[251Es]": 1755,
1911
+ "[237Am]": 1756,
1912
+ "[182Hf]": 1757,
1913
+ "[258Md]": 1758,
1914
+ "[232Np]": 1759,
1915
+ "[238Cm]": 1760,
1916
+ "[60Fe]": 1761,
1917
+ "[109Pd+2]": 1762,
1918
+ "[234Pu]": 1763,
1919
+ "[141Ce+3]": 1764,
1920
+ "[136Nd]": 1765,
1921
+ "[136Pr]": 1766,
1922
+ "[173Ta]": 1767,
1923
+ "[110Ru]": 1768,
1924
+ "[147Tb]": 1769,
1925
+ "[253Fm]": 1770,
1926
+ "[139Nd]": 1771,
1927
+ "[178Re]": 1772,
1928
+ "[177Re]": 1773,
1929
+ "[200Au]": 1774,
1930
+ "[182Re]": 1775,
1931
+ "[156Tb]": 1776,
1932
+ "[155Tb]": 1777,
1933
+ "[157Tb]": 1778,
1934
+ "[161Tb]": 1779,
1935
+ "[161Ho]": 1780,
1936
+ "[167Tm]": 1781,
1937
+ "[173Lu]": 1782,
1938
+ "[179Ta]": 1783,
1939
+ "[171Er]": 1784,
1940
+ "[44Sc]": 1785,
1941
+ "[49Sc]": 1786,
1942
+ "[49V]": 1787,
1943
+ "[51Mn]": 1788,
1944
+ "[90Nb]": 1789,
1945
+ "[88Nb]": 1790,
1946
+ "[88Zr]": 1791,
1947
+ "[36SH2]": 1792,
1948
+ "[174Yb]": 1793,
1949
+ "[178Lu]": 1794,
1950
+ "[179W]": 1795,
1951
+ "[83BrH]": 1796,
1952
+ "[107Cd]": 1797,
1953
+ "[75BrH]": 1798,
1954
+ "[62Co]": 1799,
1955
+ "[48Cr]": 1800,
1956
+ "[63Zn]": 1801,
1957
+ "[102Ag]": 1802,
1958
+ "[154Sm]": 1803,
1959
+ "[168Er]": 1804,
1960
+ "[65Ni]": 1805,
1961
+ "[137La]": 1806,
1962
+ "[187Ir]": 1807,
1963
+ "[144Pm]": 1808,
1964
+ "[146Pm]": 1809,
1965
+ "[160Gd]": 1810,
1966
+ "[166Yb]": 1811,
1967
+ "[162Dy]": 1812,
1968
+ "[47V]": 1813,
1969
+ "[141Nd]": 1814,
1970
+ "[141Sm]": 1815,
1971
+ "[166Er]": 1816,
1972
+ "[150Sm]": 1817,
1973
+ "[146Eu]": 1818,
1974
+ "[149Eu]": 1819,
1975
+ "[174Lu]": 1820,
1976
+ "[17NH3]": 1821,
1977
+ "[102Ru]": 1822,
1978
+ "[170Hf]": 1823,
1979
+ "[188Pt]": 1824,
1980
+ "[61Ni]": 1825,
1981
+ "[56Ni]": 1826,
1982
+ "[149Gd]": 1827,
1983
+ "[151Gd]": 1828,
1984
+ "[141Pm]": 1829,
1985
+ "[147Gd]": 1830,
1986
+ "[146Gd]": 1831,
1987
+ "[161Er]": 1832,
1988
+ "[103Ag]": 1833,
1989
+ "[145Eu]": 1834,
1990
+ "[153Tb]": 1835,
1991
+ "[155Dy]": 1836,
1992
+ "[184Re]": 1837,
1993
+ "[180Os]": 1838,
1994
+ "[182Os]": 1839,
1995
+ "[186Pt]": 1840,
1996
+ "[181Os]": 1841,
1997
+ "[181Re]": 1842,
1998
+ "[151Tb]": 1843,
1999
+ "[178Ta]": 1844,
2000
+ "[178W]": 1845,
2001
+ "[189Pt]": 1846,
2002
+ "[194Hg]": 1847,
2003
+ "[145Sm]": 1848,
2004
+ "[150Tb]": 1849,
2005
+ "[132La]": 1850,
2006
+ "[158Gd]": 1851,
2007
+ "[104Ag]": 1852,
2008
+ "[193Hg]": 1853,
2009
+ "[94Ru]": 1854,
2010
+ "[137Pr]": 1855,
2011
+ "[155Ho]": 1856,
2012
+ "[117Cd]": 1857,
2013
+ "[99Ru]": 1858,
2014
+ "[146Nd]": 1859,
2015
+ "[218Rn]": 1860,
2016
+ "[95Y]": 1861,
2017
+ "[79Kr]": 1862,
2018
+ "[120IH]": 1863,
2019
+ "[138Pr]": 1864,
2020
+ "[100Pd]": 1865,
2021
+ "[166Tm]": 1866,
2022
+ "[90Mo]": 1867,
2023
+ "[151Nd]": 1868,
2024
+ "[231U]": 1869,
2025
+ "[138Nd]": 1870,
2026
+ "[89Nb]": 1871,
2027
+ "[98Nb]": 1872,
2028
+ "[162Ho]": 1873,
2029
+ "[142Sm]": 1874,
2030
+ "[186Ta]": 1875,
2031
+ "[104Tc]": 1876,
2032
+ "[184Ta]": 1877,
2033
+ "[185Ta]": 1878,
2034
+ "[170Er]": 1879,
2035
+ "[107Rh]": 1880,
2036
+ "[131La]": 1881,
2037
+ "[169Lu]": 1882,
2038
+ "[74BrH]": 1883,
2039
+ "[150Pm]": 1884,
2040
+ "[172Tm]": 1885,
2041
+ "[197Pt]": 1886,
2042
+ "[230Pu]": 1887,
2043
+ "[170Lu]": 1888,
2044
+ "[86Zr]": 1889,
2045
+ "[176W]": 1890,
2046
+ "[177W]": 1891,
2047
+ "[101Pd]": 1892,
2048
+ "[105Pd]": 1893,
2049
+ "[108Pd]": 1894,
2050
+ "[149Nd]": 1895,
2051
+ "[164Ho]": 1896,
2052
+ "[159Ho]": 1897,
2053
+ "[167Ho]": 1898,
2054
+ "[176Yb]": 1899,
2055
+ "[156Sm]": 1900,
2056
+ "[77BrH]": 1901,
2057
+ "[189Re]": 1902,
2058
+ "[99Rh]": 1903,
2059
+ "[100Rh]": 1904,
2060
+ "[151Pm]": 1905,
2061
+ "[232Pa]": 1906,
2062
+ "[228Pa]": 1907,
2063
+ "[230Pa]": 1908,
2064
+ "[66Ni]": 1909,
2065
+ "[194Os]": 1910,
2066
+ "[135La]": 1911,
2067
+ "[138La]": 1912,
2068
+ "[141La]": 1913,
2069
+ "[142La]": 1914,
2070
+ "[195Ir]": 1915,
2071
+ "[96Nb]": 1916,
2072
+ "[157Ho]": 1917,
2073
+ "[183Hf]": 1918,
2074
+ "[162Tm]": 1919,
2075
+ "[172Er]": 1920,
2076
+ "[148Eu]": 1921,
2077
+ "[150Eu]": 1922,
2078
+ "[15CH4]": 1923,
2079
+ "[89Kr]": 1924,
2080
+ "[143La]": 1925,
2081
+ "[58Ni]": 1926,
2082
+ "[61Co]": 1927,
2083
+ "[158Eu]": 1928,
2084
+ "[165Er]": 1929,
2085
+ "[167Yb]": 1930,
2086
+ "[173Tm]": 1931,
2087
+ "[175Tm]": 1932,
2088
+ "[172Hf]": 1933,
2089
+ "[172Lu]": 1934,
2090
+ "[93Tc]": 1935,
2091
+ "[177Yb]": 1936,
2092
+ "[124IH]": 1937,
2093
+ "[194Ir]": 1938,
2094
+ "[147Eu]": 1939,
2095
+ "[101Mo]": 1940,
2096
+ "[180Hf]": 1941,
2097
+ "[189Ir]": 1942,
2098
+ "[87Y]": 1943,
2099
+ "[43Sc]": 1944,
2100
+ "[195Au]": 1945,
2101
+ "[112Ag]": 1946,
2102
+ "[84BrH]": 1947,
2103
+ "[106Ag]": 1948,
2104
+ "[109Ag]": 1949,
2105
+ "[101Rh]": 1950,
2106
+ "[162Yb]": 1951,
2107
+ "[228Rn]": 1952,
2108
+ "[139Pr]": 1953,
2109
+ "[94Y]": 1954,
2110
+ "[201Au]": 1955,
2111
+ "[40PH3]": 1956,
2112
+ "[110Ag+]": 1957,
2113
+ "[104Cd]": 1958,
2114
+ "[133Ba+2]": 1959,
2115
+ "[226Ac]": 1960,
2116
+ "[145Gd]": 1961,
2117
+ "[186Ir]": 1962,
2118
+ "[184Ir]": 1963,
2119
+ "[224Rn]": 1964,
2120
+ "[185Ir]": 1965,
2121
+ "[182Ir]": 1966,
2122
+ "[184Hf]": 1967,
2123
+ "[200Pt]": 1968,
2124
+ "[227Pa]": 1969,
2125
+ "[178Yb]": 1970,
2126
+ "[72Br-]": 1971,
2127
+ "[72BrH]": 1972,
2128
+ "[248Am]": 1973,
2129
+ "[238Th]": 1974,
2130
+ "[161Gd]": 1975,
2131
+ "[35S-2]": 1976,
2132
+ "[107Ag]": 1977,
2133
+ "[FeH6-4]": 1978,
2134
+ "[89Sr]": 1979,
2135
+ "[SnH3-]": 1980,
2136
+ "[SeH3]": 1981,
2137
+ "[TeH3+]": 1982,
2138
+ "[SbH4+]": 1983,
2139
+ "[AsH4+]": 1984,
2140
+ "[4He]": 1985,
2141
+ "[AsH3-]": 1986,
2142
+ "[1HH]": 1987,
2143
+ "[3H+]": 1988,
2144
+ "[82Rb]": 1989,
2145
+ "[85Sr]": 1990,
2146
+ "[90Sr]": 1991,
2147
+ "[137Cs]": 1992,
2148
+ "[133Ba]": 1993,
2149
+ "[131Cs]": 1994,
2150
+ "[SbH5]": 1995,
2151
+ "[224Ra]": 1996,
2152
+ "[22Na]": 1997,
2153
+ "[210Bi]": 1998,
2154
+ "[214Bi]": 1999,
2155
+ "[228Ra]": 2000,
2156
+ "[127Sb]": 2001,
2157
+ "[136Cs]": 2002,
2158
+ "[125Sb]": 2003,
2159
+ "[134Cs]": 2004,
2160
+ "[140Ba]": 2005,
2161
+ "[45Ca]": 2006,
2162
+ "[206Pb]": 2007,
2163
+ "[207Pb]": 2008,
2164
+ "[24Na]": 2009,
2165
+ "[86Rb]": 2010,
2166
+ "[212Bi]": 2011,
2167
+ "[208Pb]": 2012,
2168
+ "[124Sb]": 2013,
2169
+ "[204Pb]": 2014,
2170
+ "[44K]": 2015,
2171
+ "[129Te]": 2016,
2172
+ "[113Sn]": 2017,
2173
+ "[204Tl]": 2018,
2174
+ "[87Sr]": 2019,
2175
+ "[208Tl]": 2020,
2176
+ "[87Rb]": 2021,
2177
+ "[47Ca]": 2022,
2178
+ "[135Cs]": 2023,
2179
+ "[216Po]": 2024,
2180
+ "[137Ba]": 2025,
2181
+ "[207Bi]": 2026,
2182
+ "[212Po]": 2027,
2183
+ "[79Se]": 2028,
2184
+ "[223Ra]": 2029,
2185
+ "[86Sr]": 2030,
2186
+ "[122Sb]": 2031,
2187
+ "[26Al]": 2032,
2188
+ "[32Si]": 2033,
2189
+ "[126Sn]": 2034,
2190
+ "[225Ra]": 2035,
2191
+ "[114In]": 2036,
2192
+ "[72Ga]": 2037,
2193
+ "[132Te]": 2038,
2194
+ "[10Be]": 2039,
2195
+ "[125Sn]": 2040,
2196
+ "[73As]": 2041,
2197
+ "[206Bi]": 2042,
2198
+ "[117Sn]": 2043,
2199
+ "[40Ca]": 2044,
2200
+ "[41Ca]": 2045,
2201
+ "[89Rb]": 2046,
2202
+ "[116In]": 2047,
2203
+ "[129Sb]": 2048,
2204
+ "[91Sr]": 2049,
2205
+ "[71Ge]": 2050,
2206
+ "[139Ba]": 2051,
2207
+ "[69Ga]": 2052,
2208
+ "[120Sb]": 2053,
2209
+ "[121Sn]": 2054,
2210
+ "[123Sn]": 2055,
2211
+ "[131Te]": 2056,
2212
+ "[77Ge]": 2057,
2213
+ "[135Ba]": 2058,
2214
+ "[82Sr]": 2059,
2215
+ "[43K]": 2060,
2216
+ "[131Ba]": 2061,
2217
+ "[92Sr]": 2062,
2218
+ "[88Rb]": 2063,
2219
+ "[129Cs]": 2064,
2220
+ "[144Cs]": 2065,
2221
+ "[127Cs]": 2066,
2222
+ "[200Tl]": 2067,
2223
+ "[202Tl]": 2068,
2224
+ "[141Ba]": 2069,
2225
+ "[117Sb]": 2070,
2226
+ "[116Sb]": 2071,
2227
+ "[78As]": 2072,
2228
+ "[131Sb]": 2073,
2229
+ "[126Sb]": 2074,
2230
+ "[128Sb]": 2075,
2231
+ "[130Sb]": 2076,
2232
+ "[67Ge]": 2077,
2233
+ "[68Ge]": 2078,
2234
+ "[78Ge]": 2079,
2235
+ "[66Ge]": 2080,
2236
+ "[223Fr]": 2081,
2237
+ "[132Cs]": 2082,
2238
+ "[125Cs]": 2083,
2239
+ "[138Cs]": 2084,
2240
+ "[133Te]": 2085,
2241
+ "[84Rb]": 2086,
2242
+ "[83Rb]": 2087,
2243
+ "[81Rb]": 2088,
2244
+ "[142Ba]": 2089,
2245
+ "[200Bi]": 2090,
2246
+ "[115Sb]": 2091,
2247
+ "[194Tl]": 2092,
2248
+ "[70Se]": 2093,
2249
+ "[112In]": 2094,
2250
+ "[118Sb]": 2095,
2251
+ "[70Ga]": 2096,
2252
+ "[27Mg]": 2097,
2253
+ "[202Bi]": 2098,
2254
+ "[83Se]": 2099,
2255
+ "[9Li]": 2100,
2256
+ "[69As]": 2101,
2257
+ "[79Rb]": 2102,
2258
+ "[81Sr]": 2103,
2259
+ "[83Sr]": 2104,
2260
+ "[78Se]": 2105,
2261
+ "[109In]": 2106,
2262
+ "[29Al]": 2107,
2263
+ "[118Sn]": 2108,
2264
+ "[117In]": 2109,
2265
+ "[119Sb]": 2110,
2266
+ "[114Sn]": 2111,
2267
+ "[138Ba]": 2112,
2268
+ "[69Ge]": 2113,
2269
+ "[73Ga]": 2114,
2270
+ "[74Ge]": 2115,
2271
+ "[206Tl]": 2116,
2272
+ "[199Tl]": 2117,
2273
+ "[130Cs]": 2118,
2274
+ "[28Mg]": 2119,
2275
+ "[116Te]": 2120,
2276
+ "[112Sn]": 2121,
2277
+ "[126Ba]": 2122,
2278
+ "[211Bi]": 2123,
2279
+ "[81Se]": 2124,
2280
+ "[127Sn]": 2125,
2281
+ "[143Cs]": 2126,
2282
+ "[134Te]": 2127,
2283
+ "[80Sr]": 2128,
2284
+ "[45K]": 2129,
2285
+ "[215Po]": 2130,
2286
+ "[207Po]": 2131,
2287
+ "[111Sn]": 2132,
2288
+ "[211Po]": 2133,
2289
+ "[128Ba]": 2134,
2290
+ "[198Tl]": 2135,
2291
+ "[227Ra]": 2136,
2292
+ "[213Po]": 2137,
2293
+ "[220Ra]": 2138,
2294
+ "[128Sn]": 2139,
2295
+ "[203Po]": 2140,
2296
+ "[205Po]": 2141,
2297
+ "[65Ga]": 2142,
2298
+ "[197Tl]": 2143,
2299
+ "[88Sr]": 2144,
2300
+ "[110In]": 2145,
2301
+ "[31Si]": 2146,
2302
+ "[201Bi]": 2147,
2303
+ "[121Te]": 2148,
2304
+ "[205Bi]": 2149,
2305
+ "[203Bi]": 2150,
2306
+ "[195Tl]": 2151,
2307
+ "[209Tl]": 2152,
2308
+ "[110Sn]": 2153,
2309
+ "[222Fr]": 2154,
2310
+ "[207At]": 2155,
2311
+ "[119In]": 2156,
2312
+ "[As@]": 2157,
2313
+ "[129IH]": 2158,
2314
+ "[157Dy]": 2159,
2315
+ "[111IH]": 2160,
2316
+ "[230Ra]": 2161,
2317
+ "[144Pr+3]": 2162,
2318
+ "[SiH3+]": 2163,
2319
+ "[3He]": 2164,
2320
+ "[AsH5]": 2165,
2321
+ "[72Se]": 2166,
2322
+ "[95Tc]": 2167,
2323
+ "[103Pd]": 2168,
2324
+ "[121Sn+2]": 2169,
2325
+ "[211Rn]": 2170,
2326
+ "[38SH2]": 2171,
2327
+ "[127IH]": 2172,
2328
+ "[74Br-]": 2173,
2329
+ "[133I-]": 2174,
2330
+ "[100Tc+4]": 2175,
2331
+ "[100Tc]": 2176,
2332
+ "[36Cl-]": 2177,
2333
+ "[89Y+3]": 2178,
2334
+ "[104Rh]": 2179,
2335
+ "[152Sm]": 2180,
2336
+ "[226Ra]": 2181,
2337
+ "[19FH]": 2182,
2338
+ "[104Pd]": 2183,
2339
+ "[148Gd]": 2184,
2340
+ "[157Lu]": 2185,
2341
+ "[33SH2]": 2186,
2342
+ "[121I-]": 2187,
2343
+ "[17FH]": 2188,
2344
+ "[71Se]": 2189,
2345
+ "[157Sm]": 2190,
2346
+ "[148Tb]": 2191,
2347
+ "[164Dy]": 2192,
2348
+ "[15OH2]": 2193,
2349
+ "[15O+]": 2194,
2350
+ "[39K]": 2195,
2351
+ "[40Ar]": 2196,
2352
+ "[50Cr+3]": 2197,
2353
+ "[50Cr]": 2198,
2354
+ "[52Ti]": 2199,
2355
+ "[103Pd+2]": 2200,
2356
+ "[130Ba]": 2201,
2357
+ "[142Pm]": 2202,
2358
+ "[153Gd+3]": 2203,
2359
+ "[151Eu]": 2204,
2360
+ "[103Rh]": 2205,
2361
+ "[124Xe]": 2206,
2362
+ "[152Tb]": 2207,
2363
+ "[17OH2]": 2208,
2364
+ "[20Ne]": 2209,
2365
+ "[52Fe]": 2210,
2366
+ "[94Zr+4]": 2211,
2367
+ "[94Zr]": 2212,
2368
+ "[149Pr]": 2213,
2369
+ "[16OH2]": 2214,
2370
+ "[53Cr+6]": 2215,
2371
+ "[53Cr]": 2216,
2372
+ "[81Br-]": 2217,
2373
+ "[112Pd]": 2218,
2374
+ "[125Xe]": 2219,
2375
+ "[155Gd]": 2220,
2376
+ "[157Gd]": 2221,
2377
+ "[168Yb]": 2222,
2378
+ "[184Os]": 2223,
2379
+ "[166Tb]": 2224,
2380
+ "[221Fr]": 2225,
2381
+ "[212Ra]": 2226,
2382
+ "[75Br-]": 2227,
2383
+ "[79Br-]": 2228,
2384
+ "[113Ag]": 2229,
2385
+ "[23Na]": 2230,
2386
+ "[34Cl-]": 2231,
2387
+ "[34ClH]": 2232,
2388
+ "[38Cl-]": 2233,
2389
+ "[56Fe]": 2234,
2390
+ "[68Cu]": 2235,
2391
+ "[77Br-]": 2236,
2392
+ "[90Zr+4]": 2237,
2393
+ "[90Zr]": 2238,
2394
+ "[102Pd]": 2239,
2395
+ "[154Eu+3]": 2240,
2396
+ "[57Mn]": 2241,
2397
+ "[165Tm]": 2242,
2398
+ "[152Dy]": 2243,
2399
+ "[217At]": 2244,
2400
+ "[77se]": 2245,
2401
+ "[13cH-]": 2246,
2402
+ "[122Te]": 2247,
2403
+ "[156Gd]": 2248,
2404
+ "[124Te]": 2249,
2405
+ "[53Ni]": 2250,
2406
+ "[131Xe]": 2251,
2407
+ "[174Hf+4]": 2252,
2408
+ "[174Hf]": 2253,
2409
+ "[76Se]": 2254,
2410
+ "[168Tm]": 2255,
2411
+ "[167Dy]": 2256,
2412
+ "[154Gd]": 2257,
2413
+ "[95Ru]": 2258,
2414
+ "[210At]": 2259,
2415
+ "[85Br]": 2260,
2416
+ "[59Co]": 2261,
2417
+ "[122Xe]": 2262,
2418
+ "[27Al]": 2263,
2419
+ "[54Cr]": 2264,
2420
+ "[198Hg]": 2265,
2421
+ "[85Rb+]": 2266,
2422
+ "[214Tl]": 2267,
2423
+ "[229Rn]": 2268,
2424
+ "[218Pb]": 2269,
2425
+ "[218Bi]": 2270,
2426
+ "[167Tm+3]": 2271,
2427
+ "[18o+]": 2272,
2428
+ "[P@@H+]": 2273,
2429
+ "[P@H+]": 2274,
2430
+ "[13N+]": 2275,
2431
+ "[212Pb+2]": 2276,
2432
+ "[217Bi]": 2277,
2433
+ "[249Cf+2]": 2278,
2434
+ "[18OH3+]": 2279,
2435
+ "[90Sr-]": 2280,
2436
+ "[Cf+3]": 2281,
2437
+ "[200Hg]": 2282,
2438
+ "[86Tc]": 2283,
2439
+ "[141Pr+3]": 2284,
2440
+ "[141Pr]": 2285,
2441
+ "[16nH]": 2286,
2442
+ "[14NH4+]": 2287,
2443
+ "[132Xe]": 2288,
2444
+ "[83Kr]": 2289,
2445
+ "[70Zn+2]": 2290,
2446
+ "[137Ba+2]": 2291,
2447
+ "[36Ar]": 2292,
2448
+ "[38Ar]": 2293,
2449
+ "[21Ne]": 2294,
2450
+ "[126Xe]": 2295,
2451
+ "[136Xe]": 2296,
2452
+ "[128Xe]": 2297,
2453
+ "[134Xe]": 2298,
2454
+ "[84Kr]": 2299,
2455
+ "[86Kr]": 2300,
2456
+ "[78Kr]": 2301,
2457
+ "[80Kr]": 2302,
2458
+ "[82Kr]": 2303,
2459
+ "[67Zn+2]": 2304,
2460
+ "[65Cu+2]": 2305,
2461
+ "[110Te]": 2306,
2462
+ "[58Fe+3]": 2307,
2463
+ "[142Nd]": 2308,
2464
+ "[38K]": 2309,
2465
+ "[198Au+3]": 2310,
2466
+ "[122IH]": 2311,
2467
+ "[38PH3]": 2312,
2468
+ "[130I-]": 2313,
2469
+ "[40K+]": 2314,
2470
+ "[38K+]": 2315,
2471
+ "[28Mg+2]": 2316,
2472
+ "[208Tl+]": 2317,
2473
+ "[13OH2]": 2318,
2474
+ "[198Bi]": 2319,
2475
+ "[192Bi]": 2320,
2476
+ "[194Bi]": 2321,
2477
+ "[196Bi]": 2322,
2478
+ "[132I-]": 2323,
2479
+ "[83Sr+2]": 2324,
2480
+ "[169Er+3]": 2325,
2481
+ "[122I-]": 2326,
2482
+ "[120I-]": 2327,
2483
+ "[92Sr+2]": 2328,
2484
+ "[126I-]": 2329,
2485
+ "[24Mg]": 2330,
2486
+ "[84Sr]": 2331,
2487
+ "[118Pd+2]": 2332,
2488
+ "[118Pd]": 2333,
2489
+ "[AsH4]": 2334,
2490
+ "[127I-]": 2335,
2491
+ "[9C-]": 2336,
2492
+ "[11CH3+]": 2337,
2493
+ "[17B]": 2338,
2494
+ "[7B]": 2339,
2495
+ "[4HH]": 2340,
2496
+ "[18C-]": 2341,
2497
+ "[22CH3-]": 2342,
2498
+ "[22CH4]": 2343,
2499
+ "[17C-]": 2344,
2500
+ "[15CH3]": 2345,
2501
+ "[16CH3]": 2346,
2502
+ "[11NH3]": 2347,
2503
+ "[21NH3]": 2348,
2504
+ "[11N-]": 2349,
2505
+ "[11NH]": 2350,
2506
+ "[16CH]": 2351,
2507
+ "[17CH2]": 2352,
2508
+ "[99Ru+2]": 2353,
2509
+ "[181Ta+2]": 2354,
2510
+ "[181Ta]": 2355,
2511
+ "[20CH]": 2356,
2512
+ "[32PH2]": 2357,
2513
+ "[55Fe+2]": 2358,
2514
+ "[SH3]": 2359,
2515
+ "[S@H]": 2360,
2516
+ "<unk>": 2361
2517
+ },
2518
+ "unk_token": "<unk>"
2519
+ }
2520
+ }
tokenizer_config.json CHANGED
@@ -41,6 +41,12 @@
41
  "special": true
42
  }
43
  },
 
 
 
 
 
 
44
  "clean_up_tokenization_spaces": true,
45
  "cls_token": "<bos>",
46
  "mask_token": "<mask>",
 
41
  "special": true
42
  }
43
  },
44
+ "auto_map": {
45
+ "AutoTokenizer": [
46
+ "tokenization_molformer.MolformerTokenizer",
47
+ "tokenization_molformer_fast.MolformerTokenizerFast"
48
+ ]
49
+ },
50
  "clean_up_tokenization_spaces": true,
51
  "cls_token": "<bos>",
52
  "mask_token": "<mask>",