haydn-jones commited on
Commit
fc3732e
·
1 Parent(s): 0db435e

Create tokenizer.json

Browse files
Files changed (1) hide show
  1. tokenizer.json +233 -0
tokenizer.json ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": {
5
+ "strategy": "BatchLongest",
6
+ "direction": "Right",
7
+ "pad_to_multiple_of": null,
8
+ "pad_id": 3,
9
+ "pad_type_id": 0,
10
+ "pad_token": "<PAD>"
11
+ },
12
+ "added_tokens": [
13
+ {
14
+ "id": 0,
15
+ "content": "<UNK>",
16
+ "single_word": false,
17
+ "lstrip": false,
18
+ "rstrip": false,
19
+ "normalized": false,
20
+ "special": true
21
+ },
22
+ {
23
+ "id": 1,
24
+ "content": "<CLS>",
25
+ "single_word": false,
26
+ "lstrip": false,
27
+ "rstrip": false,
28
+ "normalized": false,
29
+ "special": true
30
+ },
31
+ {
32
+ "id": 2,
33
+ "content": "<EOS>",
34
+ "single_word": false,
35
+ "lstrip": false,
36
+ "rstrip": false,
37
+ "normalized": false,
38
+ "special": true
39
+ },
40
+ {
41
+ "id": 3,
42
+ "content": "<PAD>",
43
+ "single_word": false,
44
+ "lstrip": false,
45
+ "rstrip": false,
46
+ "normalized": false,
47
+ "special": true
48
+ }
49
+ ],
50
+ "normalizer": null,
51
+ "pre_tokenizer": {
52
+ "type": "Split",
53
+ "pattern": {
54
+ "String": "]"
55
+ },
56
+ "behavior": "MergedWithPrevious",
57
+ "invert": false
58
+ },
59
+ "post_processor": {
60
+ "type": "TemplateProcessing",
61
+ "single": [
62
+ {
63
+ "SpecialToken": {
64
+ "id": "<CLS>",
65
+ "type_id": 0
66
+ }
67
+ },
68
+ {
69
+ "Sequence": {
70
+ "id": "A",
71
+ "type_id": 0
72
+ }
73
+ },
74
+ {
75
+ "SpecialToken": {
76
+ "id": "<EOS>",
77
+ "type_id": 0
78
+ }
79
+ }
80
+ ],
81
+ "pair": [
82
+ {
83
+ "Sequence": {
84
+ "id": "A",
85
+ "type_id": 0
86
+ }
87
+ },
88
+ {
89
+ "Sequence": {
90
+ "id": "B",
91
+ "type_id": 1
92
+ }
93
+ }
94
+ ],
95
+ "special_tokens": {
96
+ "<CLS>": {
97
+ "id": "<CLS>",
98
+ "ids": [
99
+ 1
100
+ ],
101
+ "tokens": [
102
+ "<CLS>"
103
+ ]
104
+ },
105
+ "<EOS>": {
106
+ "id": "<EOS>",
107
+ "ids": [
108
+ 2
109
+ ],
110
+ "tokens": [
111
+ "<EOS>"
112
+ ]
113
+ }
114
+ }
115
+ },
116
+ "decoder": {
117
+ "type": "Metaspace",
118
+ "replacement": "_",
119
+ "add_prefix_space": false,
120
+ "prepend_scheme": "always"
121
+ },
122
+ "model": {
123
+ "type": "WordLevel",
124
+ "vocab": {
125
+ "<UNK>": 0,
126
+ "<CLS>": 1,
127
+ "<EOS>": 2,
128
+ "<PAD>": 3,
129
+ "[C]": 4,
130
+ "[=C]": 5,
131
+ "[Ring1]": 6,
132
+ "[Branch1]": 7,
133
+ "[N]": 8,
134
+ "[=Branch1]": 9,
135
+ "[O]": 10,
136
+ "[=O]": 11,
137
+ "[Ring2]": 12,
138
+ "[Branch2]": 13,
139
+ "[=N]": 14,
140
+ "[S]": 15,
141
+ "[#Branch1]": 16,
142
+ "[=Branch2]": 17,
143
+ "[F]": 18,
144
+ "[#Branch2]": 19,
145
+ "[#C]": 20,
146
+ "[Cl]": 21,
147
+ "[P]": 22,
148
+ "[NH1]": 23,
149
+ "[=Ring1]": 24,
150
+ "[O-1]": 25,
151
+ "[N+1]": 26,
152
+ "[Br]": 27,
153
+ "[#N]": 28,
154
+ "[=Ring2]": 29,
155
+ "[=S]": 30,
156
+ "[=N+1]": 31,
157
+ "[I]": 32,
158
+ "[S+1]": 33,
159
+ "[B]": 34,
160
+ "[Si]": 35,
161
+ "[=N-1]": 36,
162
+ "[=P]": 37,
163
+ "[Se]": 38,
164
+ "[H]": 39,
165
+ "[N-1]": 40,
166
+ "[C-1]": 41,
167
+ "[#N+1]": 42,
168
+ "[P+1]": 43,
169
+ "[OH0]": 44,
170
+ "[B-1]": 45,
171
+ "[PH1]": 46,
172
+ "[S-1]": 47,
173
+ "[=S+1]": 48,
174
+ "[=O+1]": 49,
175
+ "[=Se]": 50,
176
+ "[C+1]": 51,
177
+ "[NH3+1]": 52,
178
+ "[NH1+1]": 53,
179
+ "[BH2-1]": 54,
180
+ "[NH2+1]": 55,
181
+ "[O+1]": 56,
182
+ "[SeH1]": 57,
183
+ "[SH1]": 58,
184
+ "[=Se+1]": 59,
185
+ "[SiH2]": 60,
186
+ "[=OH1+1]": 61,
187
+ "[=SH1]": 62,
188
+ "[=PH1]": 63,
189
+ "[I+1]": 64,
190
+ "[#C-1]": 65,
191
+ "[=NH1+1]": 66,
192
+ "[CH1-1]": 67,
193
+ "[=NH2+1]": 68,
194
+ "[BH3-1]": 69,
195
+ "[NH1-1]": 70,
196
+ "[CH1+1]": 71,
197
+ "[BH1-1]": 72,
198
+ "[Se+1]": 73,
199
+ "[SiH1]": 74,
200
+ "[=C-1]": 75,
201
+ "[=Si]": 76,
202
+ "[F+1]": 77,
203
+ "[=B-1]": 78,
204
+ "[=B]": 79,
205
+ "[BH0]": 80,
206
+ "[CH1]": 81,
207
+ "[CH2+1]": 82,
208
+ "[Cl+1]": 83,
209
+ "[NH0]": 84,
210
+ "[#O+1]": 85,
211
+ "[#S]": 86,
212
+ "[Br+2]": 87,
213
+ "[Br-1]": 88,
214
+ "[CH2]": 89,
215
+ "[Cl+2]": 90,
216
+ "[Cl+3]": 91,
217
+ "[Cl-1]": 92,
218
+ "[F-1]": 93,
219
+ "[I+2]": 94,
220
+ "[I+3]": 95,
221
+ "[I-1]": 96,
222
+ "[OH1+1]": 97,
223
+ "[PH2+1]": 98,
224
+ "[SH1+1]": 99,
225
+ "[SH1-1]": 100,
226
+ "[Se-1]": 101,
227
+ "[SeH2]": 102,
228
+ "[Si-1]": 103,
229
+ "[SiH1-1]": 104
230
+ },
231
+ "unk_token": "<UNK>"
232
+ }
233
+ }