SagiPolaczek commited on
Commit
1f58eba
1 Parent(s): 8af7711

Push model using huggingface_hub.

Browse files
tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json CHANGED
@@ -77,7 +77,7 @@
77
  },
78
  {
79
  "id": 8,
80
- "content": "<MOLECULAR_ENTITY_ANTIGEN>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
@@ -95,7 +95,7 @@
95
  },
96
  {
97
  "id": 10,
98
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
@@ -104,7 +104,7 @@
104
  },
105
  {
106
  "id": 11,
107
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
@@ -2318,7 +2318,7 @@
2318
  },
2319
  {
2320
  "id": 257,
2321
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIGEN>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
@@ -2327,7 +2327,7 @@
2327
  },
2328
  {
2329
  "id": 258,
2330
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
@@ -2336,7 +2336,7 @@
2336
  },
2337
  {
2338
  "id": 259,
2339
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
@@ -2417,7 +2417,7 @@
2417
  },
2418
  {
2419
  "id": 268,
2420
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
@@ -2426,7 +2426,7 @@
2426
  },
2427
  {
2428
  "id": 269,
2429
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
@@ -2435,7 +2435,7 @@
2435
  },
2436
  {
2437
  "id": 270,
2438
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
@@ -2444,7 +2444,7 @@
2444
  },
2445
  {
2446
  "id": 271,
2447
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
@@ -2453,7 +2453,7 @@
2453
  },
2454
  {
2455
  "id": 272,
2456
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
@@ -2462,7 +2462,7 @@
2462
  },
2463
  {
2464
  "id": 273,
2465
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
@@ -2570,7 +2570,7 @@
2570
  },
2571
  {
2572
  "id": 285,
2573
- "content": "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
@@ -2669,7 +2669,7 @@
2669
  },
2670
  {
2671
  "id": 296,
2672
- "content": "<CDR3_REGION>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
@@ -2678,7 +2678,7 @@
2678
  },
2679
  {
2680
  "id": 297,
2681
- "content": "<GENERAL_CHAIN>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
@@ -2828,33 +2828,6 @@
2828
  "rstrip": false,
2829
  "normalized": false,
2830
  "special": true
2831
- },
2832
- {
2833
- "id": 314,
2834
- "content": "<BBBP>",
2835
- "single_word": false,
2836
- "lstrip": false,
2837
- "rstrip": false,
2838
- "normalized": false,
2839
- "special": true
2840
- },
2841
- {
2842
- "id": 315,
2843
- "content": "<FDA_APPR>",
2844
- "single_word": false,
2845
- "lstrip": false,
2846
- "rstrip": false,
2847
- "normalized": false,
2848
- "special": true
2849
- },
2850
- {
2851
- "id": 316,
2852
- "content": "<HIV_ACTIVITY>",
2853
- "single_word": false,
2854
- "lstrip": false,
2855
- "rstrip": false,
2856
- "normalized": false,
2857
- "special": true
2858
  }
2859
  ],
2860
  "normalizer": null,
@@ -2879,10 +2852,10 @@
2879
  "<EOS>": 5,
2880
  "<MOLECULAR_ENTITY>": 6,
2881
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2882
- "<MOLECULAR_ENTITY_ANTIGEN>": 8,
2883
  "<MOLECULAR_ENTITY_EPITOPE>": 9,
2884
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>": 10,
2885
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>": 11,
2886
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2887
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2888
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
@@ -3128,9 +3101,9 @@
3128
  "<SENTINEL_ID_197>": 254,
3129
  "<SENTINEL_ID_198>": 255,
3130
  "<SENTINEL_ID_199>": 256,
3131
- "<MOLECULAR_ENTITY_TYPE_ANTIGEN>": 257,
3132
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>": 258,
3133
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>": 259,
3134
  "<ATTRIBUTE_ORGANISM>": 260,
3135
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3136
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
@@ -3139,12 +3112,12 @@
3139
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3140
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3141
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3142
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>": 268,
3143
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>": 269,
3144
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>": 270,
3145
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>": 271,
3146
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>": 272,
3147
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>": 273,
3148
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3149
  "<TIMESTEP>": 275,
3150
  "<DIFFUSION>": 276,
@@ -3156,7 +3129,7 @@
3156
  "<BACKSPACE>": 282,
3157
  "<SEQUENCE_NATURAL_START>": 283,
3158
  "<NOOP>": 284,
3159
- "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>": 285,
3160
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3161
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3162
  "<CELL_TYPE_CLASS>": 288,
@@ -3167,8 +3140,8 @@
3167
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3168
  "<COMPLEX_ENTITY>": 294,
3169
  "<ALTERNATIVE>": 295,
3170
- "<CDR3_REGION>": 296,
3171
- "<GENERAL_CHAIN>": 297,
3172
  "<SUBMOLECULAR_ENTITY>": 298,
3173
  "<MUTATED>": 299,
3174
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
@@ -3185,9 +3158,6 @@
3185
  "<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
3186
  "<AUTOENCODER_TASK>": 312,
3187
  "<DECODED_FROM_LATENT>": 313,
3188
- "<BBBP>": 314,
3189
- "<FDA_APPR>": 315,
3190
- "<HIV_ACTIVITY>": 316,
3191
  "#": 527,
3192
  "%": 528,
3193
  "(": 529,
 
77
  },
78
  {
79
  "id": 8,
80
+ "content": "<INTERNAL_0>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
 
95
  },
96
  {
97
  "id": 10,
98
+ "content": "<INTERNAL_2>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
 
104
  },
105
  {
106
  "id": 11,
107
+ "content": "<INTERNAL_3>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
 
2318
  },
2319
  {
2320
  "id": 257,
2321
+ "content": "<INTERNAL_17>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
 
2327
  },
2328
  {
2329
  "id": 258,
2330
+ "content": "<INTERNAL_15>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
 
2336
  },
2337
  {
2338
  "id": 259,
2339
+ "content": "<INTERNAL_16>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
 
2417
  },
2418
  {
2419
  "id": 268,
2420
+ "content": "<INTERNAL_7>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
 
2426
  },
2427
  {
2428
  "id": 269,
2429
+ "content": "<INTERNAL_6>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
 
2435
  },
2436
  {
2437
  "id": 270,
2438
+ "content": "<INTERNAL_9>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
 
2444
  },
2445
  {
2446
  "id": 271,
2447
+ "content": "<INTERNAL_5>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
 
2453
  },
2454
  {
2455
  "id": 272,
2456
+ "content": "<INTERNAL_8>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
 
2462
  },
2463
  {
2464
  "id": 273,
2465
+ "content": "<INTERNAL_4>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
 
2570
  },
2571
  {
2572
  "id": 285,
2573
+ "content": "<INTERNAL_14>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
 
2669
  },
2670
  {
2671
  "id": 296,
2672
+ "content": "<INTERNAL_13>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
 
2678
  },
2679
  {
2680
  "id": 297,
2681
+ "content": "<INTERNAL_12>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
 
2828
  "rstrip": false,
2829
  "normalized": false,
2830
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2831
  }
2832
  ],
2833
  "normalizer": null,
 
2852
  "<EOS>": 5,
2853
  "<MOLECULAR_ENTITY>": 6,
2854
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2855
+ "<INTERNAL_0>": 8,
2856
  "<MOLECULAR_ENTITY_EPITOPE>": 9,
2857
+ "<INTERNAL_2>": 10,
2858
+ "<INTERNAL_3>": 11,
2859
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2860
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2861
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
 
3101
  "<SENTINEL_ID_197>": 254,
3102
  "<SENTINEL_ID_198>": 255,
3103
  "<SENTINEL_ID_199>": 256,
3104
+ "<INTERNAL_17>": 257,
3105
+ "<INTERNAL_15>": 258,
3106
+ "<INTERNAL_16>": 259,
3107
  "<ATTRIBUTE_ORGANISM>": 260,
3108
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3109
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
 
3112
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3113
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3114
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3115
+ "<INTERNAL_7>": 268,
3116
+ "<INTERNAL_6>": 269,
3117
+ "<INTERNAL_9>": 270,
3118
+ "<INTERNAL_5>": 271,
3119
+ "<INTERNAL_8>": 272,
3120
+ "<INTERNAL_4>": 273,
3121
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3122
  "<TIMESTEP>": 275,
3123
  "<DIFFUSION>": 276,
 
3129
  "<BACKSPACE>": 282,
3130
  "<SEQUENCE_NATURAL_START>": 283,
3131
  "<NOOP>": 284,
3132
+ "<INTERNAL_14>": 285,
3133
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3134
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3135
  "<CELL_TYPE_CLASS>": 288,
 
3140
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3141
  "<COMPLEX_ENTITY>": 294,
3142
  "<ALTERNATIVE>": 295,
3143
+ "<INTERNAL_13>": 296,
3144
+ "<INTERNAL_12>": 297,
3145
  "<SUBMOLECULAR_ENTITY>": 298,
3146
  "<MUTATED>": 299,
3147
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
 
3158
  "<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
3159
  "<AUTOENCODER_TASK>": 312,
3160
  "<DECODED_FROM_LATENT>": 313,
 
 
 
3161
  "#": 527,
3162
  "%": 528,
3163
  "(": 529,
tokenizer/cell_attributes_tokenizer.json CHANGED
@@ -77,7 +77,7 @@
77
  },
78
  {
79
  "id": 8,
80
- "content": "<MOLECULAR_ENTITY_ANTIGEN>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
@@ -95,7 +95,7 @@
95
  },
96
  {
97
  "id": 10,
98
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
@@ -104,7 +104,7 @@
104
  },
105
  {
106
  "id": 11,
107
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
@@ -2318,7 +2318,7 @@
2318
  },
2319
  {
2320
  "id": 257,
2321
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIGEN>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
@@ -2327,7 +2327,7 @@
2327
  },
2328
  {
2329
  "id": 258,
2330
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
@@ -2336,7 +2336,7 @@
2336
  },
2337
  {
2338
  "id": 259,
2339
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
@@ -2417,7 +2417,7 @@
2417
  },
2418
  {
2419
  "id": 268,
2420
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
@@ -2426,7 +2426,7 @@
2426
  },
2427
  {
2428
  "id": 269,
2429
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
@@ -2435,7 +2435,7 @@
2435
  },
2436
  {
2437
  "id": 270,
2438
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
@@ -2444,7 +2444,7 @@
2444
  },
2445
  {
2446
  "id": 271,
2447
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
@@ -2453,7 +2453,7 @@
2453
  },
2454
  {
2455
  "id": 272,
2456
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
@@ -2462,7 +2462,7 @@
2462
  },
2463
  {
2464
  "id": 273,
2465
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
@@ -2570,7 +2570,7 @@
2570
  },
2571
  {
2572
  "id": 285,
2573
- "content": "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
@@ -2669,7 +2669,7 @@
2669
  },
2670
  {
2671
  "id": 296,
2672
- "content": "<CDR3_REGION>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
@@ -2678,7 +2678,7 @@
2678
  },
2679
  {
2680
  "id": 297,
2681
- "content": "<GENERAL_CHAIN>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
@@ -2828,33 +2828,6 @@
2828
  "rstrip": false,
2829
  "normalized": false,
2830
  "special": true
2831
- },
2832
- {
2833
- "id": 314,
2834
- "content": "<BBBP>",
2835
- "single_word": false,
2836
- "lstrip": false,
2837
- "rstrip": false,
2838
- "normalized": false,
2839
- "special": true
2840
- },
2841
- {
2842
- "id": 315,
2843
- "content": "<FDA_APPR>",
2844
- "single_word": false,
2845
- "lstrip": false,
2846
- "rstrip": false,
2847
- "normalized": false,
2848
- "special": true
2849
- },
2850
- {
2851
- "id": 316,
2852
- "content": "<HIV_ACTIVITY>",
2853
- "single_word": false,
2854
- "lstrip": false,
2855
- "rstrip": false,
2856
- "normalized": false,
2857
- "special": true
2858
  }
2859
  ],
2860
  "normalizer": null,
@@ -2884,10 +2857,10 @@
2884
  "<EOS>": 5,
2885
  "<MOLECULAR_ENTITY>": 6,
2886
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2887
- "<MOLECULAR_ENTITY_ANTIGEN>": 8,
2888
  "<MOLECULAR_ENTITY_EPITOPE>": 9,
2889
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>": 10,
2890
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>": 11,
2891
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2892
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2893
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
@@ -3133,9 +3106,9 @@
3133
  "<SENTINEL_ID_197>": 254,
3134
  "<SENTINEL_ID_198>": 255,
3135
  "<SENTINEL_ID_199>": 256,
3136
- "<MOLECULAR_ENTITY_TYPE_ANTIGEN>": 257,
3137
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>": 258,
3138
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>": 259,
3139
  "<ATTRIBUTE_ORGANISM>": 260,
3140
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3141
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
@@ -3144,12 +3117,12 @@
3144
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3145
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3146
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3147
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>": 268,
3148
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>": 269,
3149
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>": 270,
3150
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>": 271,
3151
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>": 272,
3152
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>": 273,
3153
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3154
  "<TIMESTEP>": 275,
3155
  "<DIFFUSION>": 276,
@@ -3161,7 +3134,7 @@
3161
  "<BACKSPACE>": 282,
3162
  "<SEQUENCE_NATURAL_START>": 283,
3163
  "<NOOP>": 284,
3164
- "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>": 285,
3165
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3166
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3167
  "<CELL_TYPE_CLASS>": 288,
@@ -3172,8 +3145,8 @@
3172
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3173
  "<COMPLEX_ENTITY>": 294,
3174
  "<ALTERNATIVE>": 295,
3175
- "<CDR3_REGION>": 296,
3176
- "<GENERAL_CHAIN>": 297,
3177
  "<SUBMOLECULAR_ENTITY>": 298,
3178
  "<MUTATED>": 299,
3179
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
@@ -3190,9 +3163,6 @@
3190
  "<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
3191
  "<AUTOENCODER_TASK>": 312,
3192
  "<DECODED_FROM_LATENT>": 313,
3193
- "<BBBP>": 314,
3194
- "<FDA_APPR>": 315,
3195
- "<HIV_ACTIVITY>": 316,
3196
  "[CL:0000499]": 3522,
3197
  "[CL:2000060]": 3523,
3198
  "[CL:0000235]": 3524,
 
77
  },
78
  {
79
  "id": 8,
80
+ "content": "<INTERNAL_0>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
 
95
  },
96
  {
97
  "id": 10,
98
+ "content": "<INTERNAL_2>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
 
104
  },
105
  {
106
  "id": 11,
107
+ "content": "<INTERNAL_3>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
 
2318
  },
2319
  {
2320
  "id": 257,
2321
+ "content": "<INTERNAL_17>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
 
2327
  },
2328
  {
2329
  "id": 258,
2330
+ "content": "<INTERNAL_15>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
 
2336
  },
2337
  {
2338
  "id": 259,
2339
+ "content": "<INTERNAL_16>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
 
2417
  },
2418
  {
2419
  "id": 268,
2420
+ "content": "<INTERNAL_7>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
 
2426
  },
2427
  {
2428
  "id": 269,
2429
+ "content": "<INTERNAL_6>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
 
2435
  },
2436
  {
2437
  "id": 270,
2438
+ "content": "<INTERNAL_9>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
 
2444
  },
2445
  {
2446
  "id": 271,
2447
+ "content": "<INTERNAL_5>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
 
2453
  },
2454
  {
2455
  "id": 272,
2456
+ "content": "<INTERNAL_8>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
 
2462
  },
2463
  {
2464
  "id": 273,
2465
+ "content": "<INTERNAL_4>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
 
2570
  },
2571
  {
2572
  "id": 285,
2573
+ "content": "<INTERNAL_14>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
 
2669
  },
2670
  {
2671
  "id": 296,
2672
+ "content": "<INTERNAL_13>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
 
2678
  },
2679
  {
2680
  "id": 297,
2681
+ "content": "<INTERNAL_12>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
 
2828
  "rstrip": false,
2829
  "normalized": false,
2830
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2831
  }
2832
  ],
2833
  "normalizer": null,
 
2857
  "<EOS>": 5,
2858
  "<MOLECULAR_ENTITY>": 6,
2859
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2860
+ "<INTERNAL_0>": 8,
2861
  "<MOLECULAR_ENTITY_EPITOPE>": 9,
2862
+ "<INTERNAL_2>": 10,
2863
+ "<INTERNAL_3>": 11,
2864
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2865
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2866
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
 
3106
  "<SENTINEL_ID_197>": 254,
3107
  "<SENTINEL_ID_198>": 255,
3108
  "<SENTINEL_ID_199>": 256,
3109
+ "<INTERNAL_17>": 257,
3110
+ "<INTERNAL_15>": 258,
3111
+ "<INTERNAL_16>": 259,
3112
  "<ATTRIBUTE_ORGANISM>": 260,
3113
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3114
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
 
3117
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3118
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3119
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3120
+ "<INTERNAL_7>": 268,
3121
+ "<INTERNAL_6>": 269,
3122
+ "<INTERNAL_9>": 270,
3123
+ "<INTERNAL_5>": 271,
3124
+ "<INTERNAL_8>": 272,
3125
+ "<INTERNAL_4>": 273,
3126
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3127
  "<TIMESTEP>": 275,
3128
  "<DIFFUSION>": 276,
 
3134
  "<BACKSPACE>": 282,
3135
  "<SEQUENCE_NATURAL_START>": 283,
3136
  "<NOOP>": 284,
3137
+ "<INTERNAL_14>": 285,
3138
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3139
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3140
  "<CELL_TYPE_CLASS>": 288,
 
3145
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3146
  "<COMPLEX_ENTITY>": 294,
3147
  "<ALTERNATIVE>": 295,
3148
+ "<INTERNAL_13>": 296,
3149
+ "<INTERNAL_12>": 297,
3150
  "<SUBMOLECULAR_ENTITY>": 298,
3151
  "<MUTATED>": 299,
3152
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
 
3163
  "<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
3164
  "<AUTOENCODER_TASK>": 312,
3165
  "<DECODED_FROM_LATENT>": 313,
 
 
 
3166
  "[CL:0000499]": 3522,
3167
  "[CL:2000060]": 3523,
3168
  "[CL:0000235]": 3524,
tokenizer/gene_tokenizer.json CHANGED
@@ -77,7 +77,7 @@
77
  },
78
  {
79
  "id": 8,
80
- "content": "<MOLECULAR_ENTITY_ANTIGEN>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
@@ -95,7 +95,7 @@
95
  },
96
  {
97
  "id": 10,
98
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
@@ -104,7 +104,7 @@
104
  },
105
  {
106
  "id": 11,
107
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
@@ -2318,7 +2318,7 @@
2318
  },
2319
  {
2320
  "id": 257,
2321
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIGEN>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
@@ -2327,7 +2327,7 @@
2327
  },
2328
  {
2329
  "id": 258,
2330
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
@@ -2336,7 +2336,7 @@
2336
  },
2337
  {
2338
  "id": 259,
2339
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
@@ -2417,7 +2417,7 @@
2417
  },
2418
  {
2419
  "id": 268,
2420
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
@@ -2426,7 +2426,7 @@
2426
  },
2427
  {
2428
  "id": 269,
2429
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
@@ -2435,7 +2435,7 @@
2435
  },
2436
  {
2437
  "id": 270,
2438
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
@@ -2444,7 +2444,7 @@
2444
  },
2445
  {
2446
  "id": 271,
2447
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
@@ -2453,7 +2453,7 @@
2453
  },
2454
  {
2455
  "id": 272,
2456
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
@@ -2462,7 +2462,7 @@
2462
  },
2463
  {
2464
  "id": 273,
2465
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
@@ -2570,7 +2570,7 @@
2570
  },
2571
  {
2572
  "id": 285,
2573
- "content": "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
@@ -2669,7 +2669,7 @@
2669
  },
2670
  {
2671
  "id": 296,
2672
- "content": "<CDR3_REGION>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
@@ -2678,7 +2678,7 @@
2678
  },
2679
  {
2680
  "id": 297,
2681
- "content": "<GENERAL_CHAIN>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
@@ -2828,33 +2828,6 @@
2828
  "rstrip": false,
2829
  "normalized": false,
2830
  "special": true
2831
- },
2832
- {
2833
- "id": 314,
2834
- "content": "<BBBP>",
2835
- "single_word": false,
2836
- "lstrip": false,
2837
- "rstrip": false,
2838
- "normalized": false,
2839
- "special": true
2840
- },
2841
- {
2842
- "id": 315,
2843
- "content": "<FDA_APPR>",
2844
- "single_word": false,
2845
- "lstrip": false,
2846
- "rstrip": false,
2847
- "normalized": false,
2848
- "special": true
2849
- },
2850
- {
2851
- "id": 316,
2852
- "content": "<HIV_ACTIVITY>",
2853
- "single_word": false,
2854
- "lstrip": false,
2855
- "rstrip": false,
2856
- "normalized": false,
2857
- "special": true
2858
  }
2859
  ],
2860
  "normalizer": null,
@@ -2884,10 +2857,10 @@
2884
  "<EOS>": 5,
2885
  "<MOLECULAR_ENTITY>": 6,
2886
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2887
- "<MOLECULAR_ENTITY_ANTIGEN>": 8,
2888
  "<MOLECULAR_ENTITY_EPITOPE>": 9,
2889
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>": 10,
2890
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>": 11,
2891
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2892
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2893
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
@@ -3133,9 +3106,9 @@
3133
  "<SENTINEL_ID_197>": 254,
3134
  "<SENTINEL_ID_198>": 255,
3135
  "<SENTINEL_ID_199>": 256,
3136
- "<MOLECULAR_ENTITY_TYPE_ANTIGEN>": 257,
3137
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>": 258,
3138
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>": 259,
3139
  "<ATTRIBUTE_ORGANISM>": 260,
3140
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3141
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
@@ -3144,12 +3117,12 @@
3144
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3145
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3146
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3147
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>": 268,
3148
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>": 269,
3149
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>": 270,
3150
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>": 271,
3151
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>": 272,
3152
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>": 273,
3153
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3154
  "<TIMESTEP>": 275,
3155
  "<DIFFUSION>": 276,
@@ -3161,7 +3134,7 @@
3161
  "<BACKSPACE>": 282,
3162
  "<SEQUENCE_NATURAL_START>": 283,
3163
  "<NOOP>": 284,
3164
- "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>": 285,
3165
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3166
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3167
  "<CELL_TYPE_CLASS>": 288,
@@ -3172,8 +3145,8 @@
3172
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3173
  "<COMPLEX_ENTITY>": 294,
3174
  "<ALTERNATIVE>": 295,
3175
- "<CDR3_REGION>": 296,
3176
- "<GENERAL_CHAIN>": 297,
3177
  "<SUBMOLECULAR_ENTITY>": 298,
3178
  "<MUTATED>": 299,
3179
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
@@ -3190,9 +3163,6 @@
3190
  "<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
3191
  "<AUTOENCODER_TASK>": 312,
3192
  "<DECODED_FROM_LATENT>": 313,
3193
- "<BBBP>": 314,
3194
- "<FDA_APPR>": 315,
3195
- "<HIV_ACTIVITY>": 316,
3196
  "[100130093]": 5000,
3197
  "[100133445]": 5001,
3198
  "[100286793]": 5002,
 
77
  },
78
  {
79
  "id": 8,
80
+ "content": "<INTERNAL_0>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
 
95
  },
96
  {
97
  "id": 10,
98
+ "content": "<INTERNAL_2>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
 
104
  },
105
  {
106
  "id": 11,
107
+ "content": "<INTERNAL_3>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
 
2318
  },
2319
  {
2320
  "id": 257,
2321
+ "content": "<INTERNAL_17>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
 
2327
  },
2328
  {
2329
  "id": 258,
2330
+ "content": "<INTERNAL_15>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
 
2336
  },
2337
  {
2338
  "id": 259,
2339
+ "content": "<INTERNAL_16>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
 
2417
  },
2418
  {
2419
  "id": 268,
2420
+ "content": "<INTERNAL_7>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
 
2426
  },
2427
  {
2428
  "id": 269,
2429
+ "content": "<INTERNAL_6>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
 
2435
  },
2436
  {
2437
  "id": 270,
2438
+ "content": "<INTERNAL_9>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
 
2444
  },
2445
  {
2446
  "id": 271,
2447
+ "content": "<INTERNAL_5>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
 
2453
  },
2454
  {
2455
  "id": 272,
2456
+ "content": "<INTERNAL_8>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
 
2462
  },
2463
  {
2464
  "id": 273,
2465
+ "content": "<INTERNAL_4>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
 
2570
  },
2571
  {
2572
  "id": 285,
2573
+ "content": "<INTERNAL_14>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
 
2669
  },
2670
  {
2671
  "id": 296,
2672
+ "content": "<INTERNAL_13>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
 
2678
  },
2679
  {
2680
  "id": 297,
2681
+ "content": "<INTERNAL_12>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
 
2828
  "rstrip": false,
2829
  "normalized": false,
2830
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2831
  }
2832
  ],
2833
  "normalizer": null,
 
2857
  "<EOS>": 5,
2858
  "<MOLECULAR_ENTITY>": 6,
2859
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2860
+ "<INTERNAL_0>": 8,
2861
  "<MOLECULAR_ENTITY_EPITOPE>": 9,
2862
+ "<INTERNAL_2>": 10,
2863
+ "<INTERNAL_3>": 11,
2864
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2865
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2866
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
 
3106
  "<SENTINEL_ID_197>": 254,
3107
  "<SENTINEL_ID_198>": 255,
3108
  "<SENTINEL_ID_199>": 256,
3109
+ "<INTERNAL_17>": 257,
3110
+ "<INTERNAL_15>": 258,
3111
+ "<INTERNAL_16>": 259,
3112
  "<ATTRIBUTE_ORGANISM>": 260,
3113
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3114
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
 
3117
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3118
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3119
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3120
+ "<INTERNAL_7>": 268,
3121
+ "<INTERNAL_6>": 269,
3122
+ "<INTERNAL_9>": 270,
3123
+ "<INTERNAL_5>": 271,
3124
+ "<INTERNAL_8>": 272,
3125
+ "<INTERNAL_4>": 273,
3126
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3127
  "<TIMESTEP>": 275,
3128
  "<DIFFUSION>": 276,
 
3134
  "<BACKSPACE>": 282,
3135
  "<SEQUENCE_NATURAL_START>": 283,
3136
  "<NOOP>": 284,
3137
+ "<INTERNAL_14>": 285,
3138
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3139
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3140
  "<CELL_TYPE_CLASS>": 288,
 
3145
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3146
  "<COMPLEX_ENTITY>": 294,
3147
  "<ALTERNATIVE>": 295,
3148
+ "<INTERNAL_13>": 296,
3149
+ "<INTERNAL_12>": 297,
3150
  "<SUBMOLECULAR_ENTITY>": 298,
3151
  "<MUTATED>": 299,
3152
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
 
3163
  "<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
3164
  "<AUTOENCODER_TASK>": 312,
3165
  "<DECODED_FROM_LATENT>": 313,
 
 
 
3166
  "[100130093]": 5000,
3167
  "[100133445]": 5001,
3168
  "[100286793]": 5002,
tokenizer/t5_tokenizer_AA_special.json CHANGED
@@ -77,7 +77,7 @@
77
  },
78
  {
79
  "id": 8,
80
- "content": "<MOLECULAR_ENTITY_ANTIGEN>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
@@ -95,7 +95,7 @@
95
  },
96
  {
97
  "id": 10,
98
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
@@ -104,7 +104,7 @@
104
  },
105
  {
106
  "id": 11,
107
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
@@ -2318,7 +2318,7 @@
2318
  },
2319
  {
2320
  "id": 257,
2321
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIGEN>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
@@ -2327,7 +2327,7 @@
2327
  },
2328
  {
2329
  "id": 258,
2330
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
@@ -2336,7 +2336,7 @@
2336
  },
2337
  {
2338
  "id": 259,
2339
- "content": "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
@@ -2417,7 +2417,7 @@
2417
  },
2418
  {
2419
  "id": 268,
2420
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
@@ -2426,7 +2426,7 @@
2426
  },
2427
  {
2428
  "id": 269,
2429
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
@@ -2435,7 +2435,7 @@
2435
  },
2436
  {
2437
  "id": 270,
2438
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
@@ -2444,7 +2444,7 @@
2444
  },
2445
  {
2446
  "id": 271,
2447
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
@@ -2453,7 +2453,7 @@
2453
  },
2454
  {
2455
  "id": 272,
2456
- "content": "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
@@ -2462,7 +2462,7 @@
2462
  },
2463
  {
2464
  "id": 273,
2465
- "content": "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
@@ -2570,7 +2570,7 @@
2570
  },
2571
  {
2572
  "id": 285,
2573
- "content": "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
@@ -2669,7 +2669,7 @@
2669
  },
2670
  {
2671
  "id": 296,
2672
- "content": "<CDR3_REGION>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
@@ -2678,7 +2678,7 @@
2678
  },
2679
  {
2680
  "id": 297,
2681
- "content": "<GENERAL_CHAIN>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
@@ -2828,33 +2828,6 @@
2828
  "rstrip": false,
2829
  "normalized": false,
2830
  "special": true
2831
- },
2832
- {
2833
- "id": 314,
2834
- "content": "<BBBP>",
2835
- "single_word": false,
2836
- "lstrip": false,
2837
- "rstrip": false,
2838
- "normalized": false,
2839
- "special": true
2840
- },
2841
- {
2842
- "id": 315,
2843
- "content": "<FDA_APPR>",
2844
- "single_word": false,
2845
- "lstrip": false,
2846
- "rstrip": false,
2847
- "normalized": false,
2848
- "special": true
2849
- },
2850
- {
2851
- "id": 316,
2852
- "content": "<HIV_ACTIVITY>",
2853
- "single_word": false,
2854
- "lstrip": false,
2855
- "rstrip": false,
2856
- "normalized": false,
2857
- "special": true
2858
  }
2859
  ],
2860
  "normalizer": null,
@@ -2884,10 +2857,10 @@
2884
  "<EOS>": 5,
2885
  "<MOLECULAR_ENTITY>": 6,
2886
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2887
- "<MOLECULAR_ENTITY_ANTIGEN>": 8,
2888
  "<MOLECULAR_ENTITY_EPITOPE>": 9,
2889
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN>": 10,
2890
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN>": 11,
2891
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2892
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2893
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
@@ -3133,9 +3106,9 @@
3133
  "<SENTINEL_ID_197>": 254,
3134
  "<SENTINEL_ID_198>": 255,
3135
  "<SENTINEL_ID_199>": 256,
3136
- "<MOLECULAR_ENTITY_TYPE_ANTIGEN>": 257,
3137
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_LIGHT_CHAIN>": 258,
3138
- "<MOLECULAR_ENTITY_TYPE_ANTIBODY_HEAVY_CHAIN>": 259,
3139
  "<ATTRIBUTE_ORGANISM>": 260,
3140
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3141
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
@@ -3144,12 +3117,12 @@
3144
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3145
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3146
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3147
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR1>": 268,
3148
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR3>": 269,
3149
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR3>": 270,
3150
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR2>": 271,
3151
- "<MOLECULAR_ENTITY_ANTIBODY_HEAVY_CHAIN_CDR2>": 272,
3152
- "<MOLECULAR_ENTITY_ANTIBODY_LIGHT_CHAIN_CDR1>": 273,
3153
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3154
  "<TIMESTEP>": 275,
3155
  "<DIFFUSION>": 276,
@@ -3161,7 +3134,7 @@
3161
  "<BACKSPACE>": 282,
3162
  "<SEQUENCE_NATURAL_START>": 283,
3163
  "<NOOP>": 284,
3164
- "<TARGETED_ANTIBODY_DESIGN_ENCODER_ONLY_MODE>": 285,
3165
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3166
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3167
  "<CELL_TYPE_CLASS>": 288,
@@ -3172,8 +3145,8 @@
3172
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3173
  "<COMPLEX_ENTITY>": 294,
3174
  "<ALTERNATIVE>": 295,
3175
- "<CDR3_REGION>": 296,
3176
- "<GENERAL_CHAIN>": 297,
3177
  "<SUBMOLECULAR_ENTITY>": 298,
3178
  "<MUTATED>": 299,
3179
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
@@ -3190,9 +3163,6 @@
3190
  "<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
3191
  "<AUTOENCODER_TASK>": 312,
3192
  "<DECODED_FROM_LATENT>": 313,
3193
- "<BBBP>": 314,
3194
- "<FDA_APPR>": 315,
3195
- "<HIV_ACTIVITY>": 316,
3196
  "A": 501,
3197
  "B": 502,
3198
  "C": 503,
 
77
  },
78
  {
79
  "id": 8,
80
+ "content": "<INTERNAL_0>",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
 
95
  },
96
  {
97
  "id": 10,
98
+ "content": "<INTERNAL_2>",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
 
104
  },
105
  {
106
  "id": 11,
107
+ "content": "<INTERNAL_3>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
 
2318
  },
2319
  {
2320
  "id": 257,
2321
+ "content": "<INTERNAL_17>",
2322
  "single_word": false,
2323
  "lstrip": false,
2324
  "rstrip": false,
 
2327
  },
2328
  {
2329
  "id": 258,
2330
+ "content": "<INTERNAL_15>",
2331
  "single_word": false,
2332
  "lstrip": false,
2333
  "rstrip": false,
 
2336
  },
2337
  {
2338
  "id": 259,
2339
+ "content": "<INTERNAL_16>",
2340
  "single_word": false,
2341
  "lstrip": false,
2342
  "rstrip": false,
 
2417
  },
2418
  {
2419
  "id": 268,
2420
+ "content": "<INTERNAL_7>",
2421
  "single_word": false,
2422
  "lstrip": false,
2423
  "rstrip": false,
 
2426
  },
2427
  {
2428
  "id": 269,
2429
+ "content": "<INTERNAL_6>",
2430
  "single_word": false,
2431
  "lstrip": false,
2432
  "rstrip": false,
 
2435
  },
2436
  {
2437
  "id": 270,
2438
+ "content": "<INTERNAL_9>",
2439
  "single_word": false,
2440
  "lstrip": false,
2441
  "rstrip": false,
 
2444
  },
2445
  {
2446
  "id": 271,
2447
+ "content": "<INTERNAL_5>",
2448
  "single_word": false,
2449
  "lstrip": false,
2450
  "rstrip": false,
 
2453
  },
2454
  {
2455
  "id": 272,
2456
+ "content": "<INTERNAL_8>",
2457
  "single_word": false,
2458
  "lstrip": false,
2459
  "rstrip": false,
 
2462
  },
2463
  {
2464
  "id": 273,
2465
+ "content": "<INTERNAL_4>",
2466
  "single_word": false,
2467
  "lstrip": false,
2468
  "rstrip": false,
 
2570
  },
2571
  {
2572
  "id": 285,
2573
+ "content": "<INTERNAL_14>",
2574
  "single_word": false,
2575
  "lstrip": false,
2576
  "rstrip": false,
 
2669
  },
2670
  {
2671
  "id": 296,
2672
+ "content": "<INTERNAL_13>",
2673
  "single_word": false,
2674
  "lstrip": false,
2675
  "rstrip": false,
 
2678
  },
2679
  {
2680
  "id": 297,
2681
+ "content": "<INTERNAL_12>",
2682
  "single_word": false,
2683
  "lstrip": false,
2684
  "rstrip": false,
 
2828
  "rstrip": false,
2829
  "normalized": false,
2830
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2831
  }
2832
  ],
2833
  "normalizer": null,
 
2857
  "<EOS>": 5,
2858
  "<MOLECULAR_ENTITY>": 6,
2859
  "<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
2860
+ "<INTERNAL_0>": 8,
2861
  "<MOLECULAR_ENTITY_EPITOPE>": 9,
2862
+ "<INTERNAL_2>": 10,
2863
+ "<INTERNAL_3>": 11,
2864
  "<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
2865
  "<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
2866
  "<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
 
3106
  "<SENTINEL_ID_197>": 254,
3107
  "<SENTINEL_ID_198>": 255,
3108
  "<SENTINEL_ID_199>": 256,
3109
+ "<INTERNAL_17>": 257,
3110
+ "<INTERNAL_15>": 258,
3111
+ "<INTERNAL_16>": 259,
3112
  "<ATTRIBUTE_ORGANISM>": 260,
3113
  "<ATTRIBUTE_ORGANISM_HUMAN>": 261,
3114
  "<ATTRIBUTE_ORGANISM_RABBIT>": 262,
 
3117
  "<ATTRIBUTE_ORGANISM_MONKEY>": 265,
3118
  "<ATTRIBUTE_ORGANISM_CAMEL>": 266,
3119
  "<EPITOPE_PARATOPE_PREDICTION>": 267,
3120
+ "<INTERNAL_7>": 268,
3121
+ "<INTERNAL_6>": 269,
3122
+ "<INTERNAL_9>": 270,
3123
+ "<INTERNAL_5>": 271,
3124
+ "<INTERNAL_8>": 272,
3125
+ "<INTERNAL_4>": 273,
3126
  "<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
3127
  "<TIMESTEP>": 275,
3128
  "<DIFFUSION>": 276,
 
3134
  "<BACKSPACE>": 282,
3135
  "<SEQUENCE_NATURAL_START>": 283,
3136
  "<NOOP>": 284,
3137
+ "<INTERNAL_14>": 285,
3138
  "<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
3139
  "<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
3140
  "<CELL_TYPE_CLASS>": 288,
 
3145
  "<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
3146
  "<COMPLEX_ENTITY>": 294,
3147
  "<ALTERNATIVE>": 295,
3148
+ "<INTERNAL_13>": 296,
3149
+ "<INTERNAL_12>": 297,
3150
  "<SUBMOLECULAR_ENTITY>": 298,
3151
  "<MUTATED>": 299,
3152
  "<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
 
3163
  "<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
3164
  "<AUTOENCODER_TASK>": 312,
3165
  "<DECODED_FROM_LATENT>": 313,
 
 
 
3166
  "A": 501,
3167
  "B": 502,
3168
  "C": 503,