gbyuvd committed
Commit 67fbcc9 (verified)
1 Parent(s): d7a37a2

first commit

1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+     "word_embedding_dimension": 384,
+     "pooling_mode_cls_token": false,
+     "pooling_mode_mean_tokens": true,
+     "pooling_mode_max_tokens": true,
+     "pooling_mode_mean_sqrt_len_tokens": false,
+     "pooling_mode_weightedmean_tokens": false,
+     "pooling_mode_lasttoken": false,
+     "include_prompt": false
+ }
README.md CHANGED
@@ -1,3 +1,197 @@
- ---
- license: cc-by-nc-4.0
- ---
+ ---
+ datasets: [COCONUTDB]
+ language: [code]
+ library_name: sentence-transformers
+ metrics:
+ - pearson_cosine
+ - spearman_cosine
+ - pearson_manhattan
+ - spearman_manhattan
+ - pearson_euclidean
+ - spearman_euclidean
+ - pearson_dot
+ - spearman_dot
+ - pearson_max
+ - spearman_max
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - sentence-similarity
+ - feature-extraction
+ - generated_from_trainer
+ - dataset_size:1183174
+ - loss:CosineSimilarityLoss
+ widget:
+ - source_sentence: '[O][=C][Branch2][Branch2][Ring1][O][C][C][Branch2][Ring1][=Branch1][O][C][=Branch1][C][=O][C][C][C][C][C][C][=C][C][C][C][C][C][C][C][C][C][C][O][P][=Branch1][C][=O][Branch1][C][O][O][C][C][Branch2][Ring1][Branch1][O][C][O][C][Branch1][Ring1][C][O][C][Branch1][C][O][C][Branch1][C][O][C][Ring1][#Branch2][O][C][Branch1][C][O][C][Branch1][C][O][C][Branch1][C][O][C][Ring2][Ring1][Branch1][O][C][O][C][Branch1][Ring1][C][O][C][Branch1][C][O][C][Branch1][C][O][C][Ring1][#Branch2][O][C][C][C][C][C][C][C][C][C][C][C][C][C][C]'
+   sentences:
+   - '[O][=C][Branch2][Ring1][N][N][N][=C][Branch1][N][C][=C][C][=C][Branch1][C][Cl][C][=C][Ring1][#Branch1][C][=C][C][=C][Branch1][C][Cl][C][=C][Ring1][#Branch1][C][=C][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=C][Ring1][#Branch2][O]'
+   - '[O][=C][Branch1][C][O][C][=C][Branch1][C][C][C][C][=Branch1][C][=O][O][C]'
+   - '[O][=C][Branch1][C][O][C][C][C][C][C][C][C][C][Branch1][C][O][C][Branch1][C][O][C][Branch1][C][O][C][#C][C][Branch1][C][O][C][C][C][C]'
+ - source_sentence: '[O][=C][Branch1][#Branch1][O][C][Branch1][C][C][C][C][C][C][C][C][C][C][C]'
+   sentences:
+   - '[O][=C][O][C][C][Branch1][C][O][C][C][Ring1][#Branch1][=C][C][C][C][C][C][C][C][C][C][C][C][C][C]'
+   - '[O][=C][C][=C][C][O][C][O][C][=Ring1][Branch1][C][=C][Ring1][=Branch2][Br]'
+   - '[O][=C][Branch2][#Branch1][=C][O][C][C][=Branch1][C][=C][C][C][Branch1][#Branch1][O][C][=Branch1][C][=O][C][C][C][C][=Branch1][C][=O][C][Branch2][=Branch1][Ring1][O][C][C][Ring1][Branch2][Branch1][C][C][C][Ring1][=Branch1][Branch1][C][O][C][Branch1][#Branch1][O][C][=Branch1][C][=O][C][C][Branch1][#Branch1][O][C][=Branch1][C][=O][C][C][Ring2][Ring1][N][Branch1][#C][C][O][C][=Branch1][C][=O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][Branch1][Branch2][C][O][C][=Branch1][C][=O][C][C][Ring2][Ring2][S][C][C][=C][C][=C][C][=C][C][=C][Ring1][=Branch1]'
+ - source_sentence: '[O][=C][O][C][=C][Branch2][Ring1][#C][C][=C][C][O][C][N][Branch1][S][C][=C][C][=C][Branch1][=Branch1][O][C][C][C][C][C][=C][Ring1][O][C][C][Ring2][Ring1][Branch1][=Ring1][P][C][=Branch1][=C][=C][Ring2][Ring1][=Branch2][C][C][=C][C][=C][C][=C][Ring1][=Branch1][C]'
+   sentences:
+   - '[O][=C][N][C][Branch1][S][C][=Branch1][C][=O][N][C][=C][C][=C][C][=C][Ring1][N][Ring1][=Branch1][C][C][=Branch1][C][=O][N][C][C][C][=N][C][=Branch1][Branch1][=C][S][Ring1][Branch1][C]'
+   - '[O][=C][C][=C][Branch2][Branch2][O][O][C][=C][C][Branch2][Ring2][#C][O][C][C][Branch1][Ring1][C][O][C][C][C][=C][C][NH1][C][=C][C][=Ring1][Branch1][C][=C][Ring1][=Branch2][C][C][=Branch1][C][=O][C][Branch1][Ring1][C][O][C][C][=Branch1][Branch2][=C][Ring2][Ring1][#C][Ring2][Ring1][O][C][Ring1][#Branch2][=C][C][Branch2][Ring2][O][O][C][Branch1][Ring2][C][Ring1][Branch1][C][Branch1][C][O][C][Branch2][Ring1][Branch2][C][=C][C][C][Branch1][S][N][Branch1][=Branch1][C][C][C][O][C][C][C][C][Ring1][S][Ring1][O][C][C][C][C][C][O][=C][Ring2][Branch1][=Branch2][C][O][C][=Branch1][C][=O][O][C][C]'
+   - '[O][C][C][O][C][Branch2][Branch2][#Branch2][O][C][C][Branch1][C][O][C][Branch1][C][O][C][Branch2][#Branch1][#Branch2][O][C][Ring1][Branch2][O][C][C][C][C][Branch2][=Branch1][=N][C][=Branch2][Branch1][P][=C][C][Branch1][C][O][C][Branch1][C][C][C][Ring1][Branch2][C][C][Branch1][C][O][C][C][Branch1][=Branch2][C][C][C][Ring1][O][Ring1][Branch1][C][C][Branch2][Ring1][Branch1][O][C][O][C][Branch1][Ring1][C][O][C][Branch1][C][O][C][Branch1][C][O][C][Ring1][#Branch2][O][Branch1][C][C][C][C][=C][C][Branch1][C][C][C][C][Ring2][Ring2][=Branch2][Branch1][C][C][C][C][C][O][C][Branch1][C][O][C][Branch1][C][O][C][Ring2][Branch1][S][O]'
+ - source_sentence: '[O][=C][O][C][=C][C][=C][Branch1][O][O][C][=Branch1][C][=O][N][Branch1][C][C][C][C][=C][Ring1][N][C][=Branch1][Ring2][=C][Ring1][S][C][O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=Ring1][=Branch2]'
+   sentences:
+   - '[O][=C][C][=Branch2][#Branch1][=N][=C][C][Branch2][Ring2][N][C][NH1+1][C][=Branch2][Ring2][Ring1][=C][Branch1][#Branch2][C][=N][C][=C][C][Ring1][Branch2][Ring1][Branch1][C][C][=C][C][=Branch1][S][=C][C][=Branch1][Ring2][=C][Ring1][=Branch1][C][C][C][O][C][C][Ring1][=Branch1][C][C][C][O][C][Branch1][C][O][C][C][Branch1][C][C][C][C][C][=Branch1][C][=O][C][Branch1][C][C][Branch1][C][C][C][Ring1][#Branch2][C][C][C][Ring1][=C][Branch1][C][C][C][Ring2][Ring2][=C][Branch1][C][C][C][Ring2][Branch1][C][C][Branch1][C][C][C][C][Branch1][C][O][C][O][C][Ring1][Ring1][Branch1][C][C][C]'
+   - '[O][=C][Branch1][C][O][C][Branch2][O][P][O][C][C][=Branch1][C][=O][N][C][C][Branch1][C][O][C][C][Branch2][=Branch2][=Branch2][O][C][C][=Branch1][C][=O][N][C][C][Branch1][C][O][C][C][Branch2][=Branch1][P][O][C][C][Branch1][C][O][C][Branch1][C][O][C][Branch1][C][O][C][Branch2][Branch1][#Branch2][O][P][=Branch1][C][=O][Branch1][C][O][O][C][C][Branch2][Ring1][O][N][C][=Branch1][C][=O][C][C][C][C][C][C][C][C][C][C][C][C][C][C][C][C][C][C][C][C][C][C][Branch1][C][O][C][=C][C][C][C][C][C][C][C][C][C][C][C][C][Ring2][Branch1][#Branch1][O][Branch1][P][O][C][Ring2][Branch1][S][C][Branch1][C][O][C][Branch1][C][O][C][O][C][C][=Branch1][C][=O][O][Branch1][P][O][C][Ring2][#Branch1][=Branch1][C][Branch1][C][O][C][Branch1][C][O][C][O][C][C][=Branch1][C][=O][O][O][C][Branch1][N][C][Branch1][C][O][C][Branch1][C][O][C][O][C][C][Branch1][Branch2][N][C][=Branch1][C][=O][C][O][C][Branch1][C][O][C][Ring2][=Branch2][Branch2]'
+   - '[O][=C][Branch1][C][N][C][=N][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=Branch1][C][=O][N][Ring1][O][C][C][O][C]'
+ - source_sentence: '[O][=C][Branch1][#Branch1][C][=C][C][C][C][=C][C]'
+   sentences:
+   - '[O][=C][Branch1][C][O][C][C][C][C][C][C][C][C][Branch1][C][O][C][=C][C][#C][C][=C][C][C][C]'
+   - '[O][C][C][O][C][Branch2][=Branch2][Ring1][O][C][Branch1][C][C][C][C][C][Branch1][C][O][O][C][C][C][C][C][C][C][C][C][Branch2][Branch1][N][O][C][O][C][Branch1][Ring1][C][O][C][Branch2][Ring2][#Branch1][O][C][O][C][Branch1][Ring1][C][O][C][Branch1][C][O][C][Branch1][C][O][C][Ring1][#Branch2][O][C][O][C][Branch1][C][C][C][Branch1][C][O][C][Branch1][C][O][C][Ring1][=Branch2][O][C][Branch1][C][O][C][Ring2][Ring1][#C][O][C][C][C][Ring2][Ring2][#Branch1][Branch1][C][C][C][Ring2][Ring2][N][C][C][C][Ring2][Ring2][S][Branch1][C][C][C][Ring2][Branch1][Ring2][C][Ring2][Branch1][Branch2][C][C][Branch1][C][O][C][Branch1][C][O][C][Ring2][=Branch1][=Branch1][O]'
+   - '[O][=C][Branch1][#Branch2][C][=C][C][#C][C][#C][C][#C][C][N][C][C][C][=C][C][=C][C][=C][Ring1][=Branch1]'
+ model-index:
+ - name: SentenceTransformer
+   results:
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: NP isotest
+       type: NP-isotest
+     metrics:
+     - type: pearson_cosine
+       value: 0.936731178796972
+       name: Pearson Cosine
+     - type: spearman_cosine
+       value: 0.93027366634068
+       name: Spearman Cosine
+     - type: pearson_manhattan
+       value: 0.826340669261792
+       name: Pearson Manhattan
+     - type: spearman_manhattan
+       value: 0.845192256146849
+       name: Spearman Manhattan
+     - type: pearson_euclidean
+       value: 0.842726066770598
+       name: Pearson Euclidean
+     - type: spearman_euclidean
+       value: 0.865381289346298
+       name: Spearman Euclidean
+     - type: pearson_dot
+       value: 0.924283770507162
+       name: Pearson Dot
+     - type: spearman_dot
+       value: 0.923230424410894
+       name: Spearman Dot
+     - type: pearson_max
+       value: 0.936731178796972
+       name: Pearson Max
+     - type: spearman_max
+       value: 0.93027366634068
+       name: Spearman Max
+ ---
+
+ # ChEmbed v0.1
+
+ This prototype is a [sentence-transformers](https://www.SBERT.net) model based on [MiniLM-L6-H384-uncased](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased), fine-tuned on around 1 million pairs of SELFIES [(Krenn et al. 2020)](https://github.com/aspuru-guzik-group/selfies) of valid natural compounds taken from COCONUTDB [(Sorokina et al. 2021)](https://coconut.naturalproducts.net/). It maps compounds' *Self-Referencing Embedded Strings* (SELFIES) into a 768-dimensional dense vector space and can potentially be used for chemical similarity, similarity search, classification, clustering, and more.
+
+ I am planning to train this model for more epochs on the current dataset before moving on to a larger dataset of 6 million pairs generated from ChEMBL 34. However, this will take some time due to computational and financial constraints. A future project of mine is to develop a custom model specifically for cheminformatics, to address the biases and optimization issues that come with repurposing an embedding model designed for NLP tasks.
+
+ ### Disclaimer: For Academic Purposes Only
+ The information and model provided are for academic purposes only. They are intended for educational and research use and should not be used for any commercial or legal purposes. The author does not guarantee the accuracy, completeness, or reliability of the information.
+
+ ## Model Details
+
+ ### Model Description
+ - **Model Type:** Sentence Transformer
+ - **Base model:** [MiniLM-L6-H384-uncased](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased)
+ - **Maximum Sequence Length:** 512 tokens
+ - **Output Dimensionality:** 768 dimensions
+ - **Similarity Function:** Cosine Similarity
+ - **Training Dataset:** SELFIES pairs generated from COCONUTDB
+ - **Language:** SELFIES
+ - **License:** CC BY-NC 4.0
+
+
+ ### Full Model Architecture
+
+ ```
+ SentenceTransformer(
+   (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel
+   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': True, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': False})
+ )
+ ```
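+
+ Because the Pooling module enables both mean and max pooling, the 384-dimensional token embeddings are pooled twice and the two results are concatenated, which is what gives the 768-dimensional sentence embeddings. The snippet below is only a minimal illustration of that concatenation in plain PyTorch (random tensors, not the exact sentence-transformers internals):
+
+ ```python
+ import torch
+
+ # stand-in for the (batch, seq_len, 384) token embeddings produced by the BertModel
+ token_embeddings = torch.randn(2, 16, 384)
+ attention_mask = torch.ones(2, 16)
+
+ # mean pooling over non-padding tokens
+ mask = attention_mask.unsqueeze(-1)
+ mean_pooled = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
+
+ # max pooling over tokens
+ max_pooled = token_embeddings.max(dim=1).values
+
+ # concatenating both modes: 384 + 384 = 768 dimensions
+ sentence_embedding = torch.cat([mean_pooled, max_pooled], dim=1)
+ print(sentence_embedding.shape)  # torch.Size([2, 768])
+ ```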
+
+ ## Usage
+
+ ### Direct Usage (Sentence Transformers)
+
+ First install the Sentence Transformers library:
+
+ ```bash
+ pip install -U sentence-transformers
+ ```
+
+ Then you can load this model and run inference.
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ # Download from the 🤗 Hub
+ model = SentenceTransformer("sentence_transformers_model_id")
+ # Run inference
+ sentences = [
+     '[O][=C][Branch1][#Branch1][C][=C][C][C][C][=C][C]',
+     '[O][=C][Branch1][C][O][C][C][C][C][C][C][C][C][Branch1][C][O][C][=C][C][#C][C][=C][C][C][C]',
+     '[O][C][C][O][C][Branch2][=Branch2][Ring1][O][C][Branch1][C][C][C][C][C][Branch1][C][O][O][C][C][C][C][C][C][C][C][C][Branch2][Branch1][N][O][C][O][C][Branch1][Ring1][C][O][C][Branch2][Ring2][#Branch1][O][C][O][C][Branch1][Ring1][C][O][C][Branch1][C][O][C][Branch1][C][O][C][Ring1][#Branch2][O][C][O][C][Branch1][C][C][C][Branch1][C][O][C][Branch1][C][O][C][Ring1][=Branch2][O][C][Branch1][C][O][C][Ring2][Ring1][#C][O][C][C][C][Ring2][Ring2][#Branch1][Branch1][C][C][C][Ring2][Ring2][N][C][C][C][Ring2][Ring2][S][Branch1][C][C][C][Ring2][Branch1][Ring2][C][Ring2][Branch1][Branch2][C][C][Branch1][C][O][C][Branch1][C][O][C][Ring2][=Branch1][=Branch1][O]',
+ ]
+ embeddings = model.encode(sentences)
+ print(embeddings.shape)
+ # [3, 768]
+
+ # Get the similarity scores for the embeddings
+ similarities = model.similarity(embeddings, embeddings)
+ print(similarities.shape)
+ # [3, 3]
+ ```
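+
+ The model expects SELFIES strings like those above. If your compounds are stored as SMILES, they can be converted first with the [selfies](https://github.com/aspuru-guzik-group/selfies) package cited earlier; a minimal sketch (the model id placeholder and the caffeine SMILES are just examples):
+
+ ```python
+ import selfies as sf
+ from sentence_transformers import SentenceTransformer
+
+ model = SentenceTransformer("sentence_transformers_model_id")
+
+ # convert a SMILES string (caffeine, as an arbitrary example) to SELFIES before encoding
+ smiles = "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"
+ selfies_string = sf.encoder(smiles)
+
+ embedding = model.encode(selfies_string)
+ print(embedding.shape)
+ # (768,)
+ ```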
+
+ ## Dataset
+
+ | Dataset                       | Reference                                                      | Number of Pairs |
+ |:------------------------------|:---------------------------------------------------------------|:----------------|
+ | COCONUTDB (0.8:0.1:0.1 split) | [(Sorokina et al. 2021)](https://coconut.naturalproducts.net/) | 1,183,174       |
+
+
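+ The card's tags indicate the pairs were trained with `CosineSimilarityLoss`. As a rough, hypothetical sketch only (the toy pairs and scores below are made up, and the exact training script is not part of this repository), a comparable fine-tuning run could be set up like this:
+
+ ```python
+ from torch.utils.data import DataLoader
+ from sentence_transformers import SentenceTransformer, InputExample, losses, models
+
+ # base encoder named in the card, with mean+max pooling as in 1_Pooling/config.json
+ word_embedding_model = models.Transformer("nreimers/MiniLM-L6-H384-uncased", max_seq_length=512)
+ pooling_model = models.Pooling(
+     word_embedding_model.get_word_embedding_dimension(),  # 384
+     pooling_mode_mean_tokens=True,
+     pooling_mode_max_tokens=True,
+ )
+ model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+
+ # toy SELFIES pairs with similarity labels in [0, 1] (illustrative only)
+ train_examples = [
+     InputExample(texts=["[O][=C][Branch1][C][O][C]", "[O][=C][O][C][C]"], label=0.8),
+     InputExample(texts=["[O][=C][Branch1][C][O][C]", "[N][C][=C][C][=C]"], label=0.2),
+ ]
+ train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
+ train_loss = losses.CosineSimilarityLoss(model)
+
+ model.fit(
+     train_objectives=[(train_dataloader, train_loss)],
+     epochs=1,
+     warmup_steps=100,
+ )
+ ```
+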
+ ## Evaluation
+
+ ### Metrics
+
+ #### Semantic Similarity
+ * Dataset: `NP-isotest`
+ * Evaluated with [<code>EmbeddingSimilarityEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator)
+
+ | Metric              | Value      |
+ |:--------------------|:-----------|
+ | pearson_cosine      | 0.9367     |
+ | **spearman_cosine** | **0.9303** |
+ | pearson_manhattan   | 0.8263     |
+ | spearman_manhattan  | 0.8452     |
+ | pearson_euclidean   | 0.8427     |
+ | spearman_euclidean  | 0.8654     |
+ | pearson_dot         | 0.9243     |
+ | spearman_dot        | 0.9232     |
+ | pearson_max         | 0.9367     |
+ | spearman_max        | 0.9303     |
+
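+ A minimal sketch of how such an evaluation can be reproduced with the evaluator linked above, assuming held-out SELFIES pairs with gold similarity scores (the model id placeholder and toy pairs below are illustrative only):
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+ from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
+
+ model = SentenceTransformer("sentence_transformers_model_id")
+
+ # held-out pairs and gold similarity scores in [0, 1] (toy values for illustration)
+ sentences1 = ["[O][=C][Branch1][C][O][C]", "[O][=C][O][C][C]"]
+ sentences2 = ["[O][=C][Branch1][C][O][C][C]", "[N][C][=C][C][=C]"]
+ scores = [0.9, 0.1]
+
+ evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, scores, name="NP-isotest")
+ print(evaluator(model))  # Pearson/Spearman correlations for cosine, Euclidean, Manhattan, dot
+ ```
+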
+ ## Limitations
+ For now, the model may be ineffective at embedding synthetic drugs, since it is trained only on natural products. In addition, the tokenizer has not yet been customized for SELFIES (it is still the default BERT tokenizer).
+
+ ### Framework Versions
+ - Python: 3.9.13
+ - Sentence Transformers: 3.0.1
+ - Transformers: 4.41.2
+ - PyTorch: 2.3.1+cu121
+ - Accelerate: 0.31.0
+ - Datasets: 2.20.0
+ - Tokenizers: 0.19.1
+
+ ## Contact
+
+ G Bayu (gbyuvd@proton.me)
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+     "_name_or_path": "./output_NP/checkpoint4",
+     "architectures": [
+         "BertModel"
+     ],
+     "attention_probs_dropout_prob": 0.1,
+     "classifier_dropout": null,
+     "gradient_checkpointing": false,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.1,
+     "hidden_size": 384,
+     "initializer_range": 0.02,
+     "intermediate_size": 1536,
+     "layer_norm_eps": 1e-12,
+     "max_position_embeddings": 512,
+     "model_type": "bert",
+     "num_attention_heads": 12,
+     "num_hidden_layers": 6,
+     "pad_token_id": 0,
+     "position_embedding_type": "absolute",
+     "torch_dtype": "float32",
+     "transformers_version": "4.41.2",
+     "type_vocab_size": 2,
+     "use_cache": true,
+     "vocab_size": 30522
+ }
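
As a quick sanity check (a sketch, not part of the repository files), instantiating the architecture described by this config with `transformers` gives roughly 22.7 million float32 parameters, which is consistent with the ~91 MB `model.safetensors` below:

```python
from transformers import BertConfig, BertModel

# same hyperparameters as in config.json above
config = BertConfig(
    hidden_size=384,
    intermediate_size=1536,
    num_attention_heads=12,
    num_hidden_layers=6,
    max_position_embeddings=512,
    vocab_size=30522,
)
model = BertModel(config)
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params:,} parameters (~{n_params * 4 / 1e6:.0f} MB in float32)")
```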
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
+ {
+     "__version__": {
+         "sentence_transformers": "3.0.1",
+         "transformers": "4.41.2",
+         "pytorch": "2.3.1+cu121"
+     },
+     "prompts": {},
+     "default_prompt_name": null,
+     "similarity_fn_name": null
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e43427517384ee34089ea4da39dbb6c090433eea56d56837c5f7826e569978d2
+ size 90864192
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+     {
+         "idx": 0,
+         "name": "0",
+         "path": "",
+         "type": "sentence_transformers.models.Transformer"
+     },
+     {
+         "idx": 1,
+         "name": "1",
+         "path": "1_Pooling",
+         "type": "sentence_transformers.models.Pooling"
+     }
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+     "max_seq_length": 512,
+     "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+     "cls_token": {
+         "content": "[CLS]",
+         "lstrip": false,
+         "normalized": false,
+         "rstrip": false,
+         "single_word": false
+     },
+     "mask_token": {
+         "content": "[MASK]",
+         "lstrip": false,
+         "normalized": false,
+         "rstrip": false,
+         "single_word": false
+     },
+     "pad_token": {
+         "content": "[PAD]",
+         "lstrip": false,
+         "normalized": false,
+         "rstrip": false,
+         "single_word": false
+     },
+     "sep_token": {
+         "content": "[SEP]",
+         "lstrip": false,
+         "normalized": false,
+         "rstrip": false,
+         "single_word": false
+     },
+     "unk_token": {
+         "content": "[UNK]",
+         "lstrip": false,
+         "normalized": false,
+         "rstrip": false,
+         "single_word": false
+     }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
+ {
+     "added_tokens_decoder": {
+         "0": {
+             "content": "[PAD]",
+             "lstrip": false,
+             "normalized": false,
+             "rstrip": false,
+             "single_word": false,
+             "special": true
+         },
+         "100": {
+             "content": "[UNK]",
+             "lstrip": false,
+             "normalized": false,
+             "rstrip": false,
+             "single_word": false,
+             "special": true
+         },
+         "101": {
+             "content": "[CLS]",
+             "lstrip": false,
+             "normalized": false,
+             "rstrip": false,
+             "single_word": false,
+             "special": true
+         },
+         "102": {
+             "content": "[SEP]",
+             "lstrip": false,
+             "normalized": false,
+             "rstrip": false,
+             "single_word": false,
+             "special": true
+         },
+         "103": {
+             "content": "[MASK]",
+             "lstrip": false,
+             "normalized": false,
+             "rstrip": false,
+             "single_word": false,
+             "special": true
+         }
+     },
+     "clean_up_tokenization_spaces": true,
+     "cls_token": "[CLS]",
+     "do_basic_tokenize": true,
+     "do_lower_case": true,
+     "mask_token": "[MASK]",
+     "max_length": 512,
+     "model_max_length": 512,
+     "never_split": null,
+     "pad_to_multiple_of": null,
+     "pad_token": "[PAD]",
+     "pad_token_type_id": 0,
+     "padding_side": "right",
+     "sep_token": "[SEP]",
+     "stride": 0,
+     "strip_accents": null,
+     "tokenize_chinese_chars": true,
+     "tokenizer_class": "BertTokenizer",
+     "truncation_side": "right",
+     "truncation_strategy": "longest_first",
+     "unk_token": "[UNK]"
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff