SagiPolaczek
committed on
Commit
•
1f58eba
1
Parent(s):
8af7711
Push model using huggingface_hub.
Browse files
tokenizer/bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
CHANGED
@@ -77,7 +77,7 @@
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
-
"content": "<
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
@@ -95,7 +95,7 @@
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
-
"content": "<
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
@@ -104,7 +104,7 @@
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
-
"content": "<
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
@@ -2318,7 +2318,7 @@
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
-
"content": "<
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
@@ -2327,7 +2327,7 @@
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
-
"content": "<
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
@@ -2336,7 +2336,7 @@
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
-
"content": "<
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
@@ -2417,7 +2417,7 @@
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
-
"content": "<
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
@@ -2426,7 +2426,7 @@
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
-
"content": "<
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
@@ -2435,7 +2435,7 @@
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
-
"content": "<
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
@@ -2444,7 +2444,7 @@
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
-
"content": "<
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
@@ -2453,7 +2453,7 @@
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
-
"content": "<
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
@@ -2462,7 +2462,7 @@
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
-
"content": "<
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
@@ -2570,7 +2570,7 @@
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
-
"content": "<
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
@@ -2669,7 +2669,7 @@
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
-
"content": "<
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
@@ -2678,7 +2678,7 @@
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
-
"content": "<
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
@@ -2828,33 +2828,6 @@
|
|
2828 |
"rstrip": false,
|
2829 |
"normalized": false,
|
2830 |
"special": true
|
2831 |
-
},
|
2832 |
-
{
|
2833 |
-
"id": 314,
|
2834 |
-
"content": "<BBBP>",
|
2835 |
-
"single_word": false,
|
2836 |
-
"lstrip": false,
|
2837 |
-
"rstrip": false,
|
2838 |
-
"normalized": false,
|
2839 |
-
"special": true
|
2840 |
-
},
|
2841 |
-
{
|
2842 |
-
"id": 315,
|
2843 |
-
"content": "<FDA_APPR>",
|
2844 |
-
"single_word": false,
|
2845 |
-
"lstrip": false,
|
2846 |
-
"rstrip": false,
|
2847 |
-
"normalized": false,
|
2848 |
-
"special": true
|
2849 |
-
},
|
2850 |
-
{
|
2851 |
-
"id": 316,
|
2852 |
-
"content": "<HIV_ACTIVITY>",
|
2853 |
-
"single_word": false,
|
2854 |
-
"lstrip": false,
|
2855 |
-
"rstrip": false,
|
2856 |
-
"normalized": false,
|
2857 |
-
"special": true
|
2858 |
}
|
2859 |
],
|
2860 |
"normalizer": null,
|
@@ -2879,10 +2852,10 @@
|
|
2879 |
"<EOS>": 5,
|
2880 |
"<MOLECULAR_ENTITY>": 6,
|
2881 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2882 |
-
"<
|
2883 |
"<MOLECULAR_ENTITY_EPITOPE>": 9,
|
2884 |
-
"<
|
2885 |
-
"<
|
2886 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2887 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2888 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
@@ -3128,9 +3101,9 @@
|
|
3128 |
"<SENTINEL_ID_197>": 254,
|
3129 |
"<SENTINEL_ID_198>": 255,
|
3130 |
"<SENTINEL_ID_199>": 256,
|
3131 |
-
"<
|
3132 |
-
"<
|
3133 |
-
"<
|
3134 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3135 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3136 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
@@ -3139,12 +3112,12 @@
|
|
3139 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3140 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3141 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3142 |
-
"<
|
3143 |
-
"<
|
3144 |
-
"<
|
3145 |
-
"<
|
3146 |
-
"<
|
3147 |
-
"<
|
3148 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3149 |
"<TIMESTEP>": 275,
|
3150 |
"<DIFFUSION>": 276,
|
@@ -3156,7 +3129,7 @@
|
|
3156 |
"<BACKSPACE>": 282,
|
3157 |
"<SEQUENCE_NATURAL_START>": 283,
|
3158 |
"<NOOP>": 284,
|
3159 |
-
"<
|
3160 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3161 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3162 |
"<CELL_TYPE_CLASS>": 288,
|
@@ -3167,8 +3140,8 @@
|
|
3167 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3168 |
"<COMPLEX_ENTITY>": 294,
|
3169 |
"<ALTERNATIVE>": 295,
|
3170 |
-
"<
|
3171 |
-
"<
|
3172 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3173 |
"<MUTATED>": 299,
|
3174 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
@@ -3185,9 +3158,6 @@
|
|
3185 |
"<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
|
3186 |
"<AUTOENCODER_TASK>": 312,
|
3187 |
"<DECODED_FROM_LATENT>": 313,
|
3188 |
-
"<BBBP>": 314,
|
3189 |
-
"<FDA_APPR>": 315,
|
3190 |
-
"<HIV_ACTIVITY>": 316,
|
3191 |
"#": 527,
|
3192 |
"%": 528,
|
3193 |
"(": 529,
|
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
+
"content": "<INTERNAL_0>",
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
+
"content": "<INTERNAL_2>",
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
+
"content": "<INTERNAL_3>",
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
+
"content": "<INTERNAL_17>",
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
+
"content": "<INTERNAL_15>",
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
+
"content": "<INTERNAL_16>",
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
+
"content": "<INTERNAL_7>",
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
+
"content": "<INTERNAL_6>",
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
+
"content": "<INTERNAL_9>",
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
+
"content": "<INTERNAL_5>",
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
+
"content": "<INTERNAL_8>",
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
+
"content": "<INTERNAL_4>",
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
+
"content": "<INTERNAL_14>",
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
+
"content": "<INTERNAL_13>",
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
+
"content": "<INTERNAL_12>",
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
|
|
2828 |
"rstrip": false,
|
2829 |
"normalized": false,
|
2830 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2831 |
}
|
2832 |
],
|
2833 |
"normalizer": null,
|
|
|
2852 |
"<EOS>": 5,
|
2853 |
"<MOLECULAR_ENTITY>": 6,
|
2854 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2855 |
+
"<INTERNAL_0>": 8,
|
2856 |
"<MOLECULAR_ENTITY_EPITOPE>": 9,
|
2857 |
+
"<INTERNAL_2>": 10,
|
2858 |
+
"<INTERNAL_3>": 11,
|
2859 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2860 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2861 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
|
|
3101 |
"<SENTINEL_ID_197>": 254,
|
3102 |
"<SENTINEL_ID_198>": 255,
|
3103 |
"<SENTINEL_ID_199>": 256,
|
3104 |
+
"<INTERNAL_17>": 257,
|
3105 |
+
"<INTERNAL_15>": 258,
|
3106 |
+
"<INTERNAL_16>": 259,
|
3107 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3108 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3109 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
|
|
3112 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3113 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3114 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3115 |
+
"<INTERNAL_7>": 268,
|
3116 |
+
"<INTERNAL_6>": 269,
|
3117 |
+
"<INTERNAL_9>": 270,
|
3118 |
+
"<INTERNAL_5>": 271,
|
3119 |
+
"<INTERNAL_8>": 272,
|
3120 |
+
"<INTERNAL_4>": 273,
|
3121 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3122 |
"<TIMESTEP>": 275,
|
3123 |
"<DIFFUSION>": 276,
|
|
|
3129 |
"<BACKSPACE>": 282,
|
3130 |
"<SEQUENCE_NATURAL_START>": 283,
|
3131 |
"<NOOP>": 284,
|
3132 |
+
"<INTERNAL_14>": 285,
|
3133 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3134 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3135 |
"<CELL_TYPE_CLASS>": 288,
|
|
|
3140 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3141 |
"<COMPLEX_ENTITY>": 294,
|
3142 |
"<ALTERNATIVE>": 295,
|
3143 |
+
"<INTERNAL_13>": 296,
|
3144 |
+
"<INTERNAL_12>": 297,
|
3145 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3146 |
"<MUTATED>": 299,
|
3147 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
|
|
3158 |
"<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
|
3159 |
"<AUTOENCODER_TASK>": 312,
|
3160 |
"<DECODED_FROM_LATENT>": 313,
|
|
|
|
|
|
|
3161 |
"#": 527,
|
3162 |
"%": 528,
|
3163 |
"(": 529,
|
tokenizer/cell_attributes_tokenizer.json
CHANGED
@@ -77,7 +77,7 @@
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
-
"content": "<
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
@@ -95,7 +95,7 @@
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
-
"content": "<
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
@@ -104,7 +104,7 @@
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
-
"content": "<
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
@@ -2318,7 +2318,7 @@
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
-
"content": "<
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
@@ -2327,7 +2327,7 @@
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
-
"content": "<
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
@@ -2336,7 +2336,7 @@
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
-
"content": "<
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
@@ -2417,7 +2417,7 @@
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
-
"content": "<
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
@@ -2426,7 +2426,7 @@
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
-
"content": "<
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
@@ -2435,7 +2435,7 @@
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
-
"content": "<
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
@@ -2444,7 +2444,7 @@
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
-
"content": "<
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
@@ -2453,7 +2453,7 @@
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
-
"content": "<
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
@@ -2462,7 +2462,7 @@
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
-
"content": "<
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
@@ -2570,7 +2570,7 @@
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
-
"content": "<
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
@@ -2669,7 +2669,7 @@
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
-
"content": "<
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
@@ -2678,7 +2678,7 @@
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
-
"content": "<
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
@@ -2828,33 +2828,6 @@
|
|
2828 |
"rstrip": false,
|
2829 |
"normalized": false,
|
2830 |
"special": true
|
2831 |
-
},
|
2832 |
-
{
|
2833 |
-
"id": 314,
|
2834 |
-
"content": "<BBBP>",
|
2835 |
-
"single_word": false,
|
2836 |
-
"lstrip": false,
|
2837 |
-
"rstrip": false,
|
2838 |
-
"normalized": false,
|
2839 |
-
"special": true
|
2840 |
-
},
|
2841 |
-
{
|
2842 |
-
"id": 315,
|
2843 |
-
"content": "<FDA_APPR>",
|
2844 |
-
"single_word": false,
|
2845 |
-
"lstrip": false,
|
2846 |
-
"rstrip": false,
|
2847 |
-
"normalized": false,
|
2848 |
-
"special": true
|
2849 |
-
},
|
2850 |
-
{
|
2851 |
-
"id": 316,
|
2852 |
-
"content": "<HIV_ACTIVITY>",
|
2853 |
-
"single_word": false,
|
2854 |
-
"lstrip": false,
|
2855 |
-
"rstrip": false,
|
2856 |
-
"normalized": false,
|
2857 |
-
"special": true
|
2858 |
}
|
2859 |
],
|
2860 |
"normalizer": null,
|
@@ -2884,10 +2857,10 @@
|
|
2884 |
"<EOS>": 5,
|
2885 |
"<MOLECULAR_ENTITY>": 6,
|
2886 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2887 |
-
"<
|
2888 |
"<MOLECULAR_ENTITY_EPITOPE>": 9,
|
2889 |
-
"<
|
2890 |
-
"<
|
2891 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2892 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2893 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
@@ -3133,9 +3106,9 @@
|
|
3133 |
"<SENTINEL_ID_197>": 254,
|
3134 |
"<SENTINEL_ID_198>": 255,
|
3135 |
"<SENTINEL_ID_199>": 256,
|
3136 |
-
"<
|
3137 |
-
"<
|
3138 |
-
"<
|
3139 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3140 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3141 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
@@ -3144,12 +3117,12 @@
|
|
3144 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3145 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3146 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3147 |
-
"<
|
3148 |
-
"<
|
3149 |
-
"<
|
3150 |
-
"<
|
3151 |
-
"<
|
3152 |
-
"<
|
3153 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3154 |
"<TIMESTEP>": 275,
|
3155 |
"<DIFFUSION>": 276,
|
@@ -3161,7 +3134,7 @@
|
|
3161 |
"<BACKSPACE>": 282,
|
3162 |
"<SEQUENCE_NATURAL_START>": 283,
|
3163 |
"<NOOP>": 284,
|
3164 |
-
"<
|
3165 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3166 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3167 |
"<CELL_TYPE_CLASS>": 288,
|
@@ -3172,8 +3145,8 @@
|
|
3172 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3173 |
"<COMPLEX_ENTITY>": 294,
|
3174 |
"<ALTERNATIVE>": 295,
|
3175 |
-
"<
|
3176 |
-
"<
|
3177 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3178 |
"<MUTATED>": 299,
|
3179 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
@@ -3190,9 +3163,6 @@
|
|
3190 |
"<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
|
3191 |
"<AUTOENCODER_TASK>": 312,
|
3192 |
"<DECODED_FROM_LATENT>": 313,
|
3193 |
-
"<BBBP>": 314,
|
3194 |
-
"<FDA_APPR>": 315,
|
3195 |
-
"<HIV_ACTIVITY>": 316,
|
3196 |
"[CL:0000499]": 3522,
|
3197 |
"[CL:2000060]": 3523,
|
3198 |
"[CL:0000235]": 3524,
|
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
+
"content": "<INTERNAL_0>",
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
+
"content": "<INTERNAL_2>",
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
+
"content": "<INTERNAL_3>",
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
+
"content": "<INTERNAL_17>",
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
+
"content": "<INTERNAL_15>",
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
+
"content": "<INTERNAL_16>",
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
+
"content": "<INTERNAL_7>",
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
+
"content": "<INTERNAL_6>",
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
+
"content": "<INTERNAL_9>",
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
+
"content": "<INTERNAL_5>",
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
+
"content": "<INTERNAL_8>",
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
+
"content": "<INTERNAL_4>",
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
+
"content": "<INTERNAL_14>",
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
+
"content": "<INTERNAL_13>",
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
+
"content": "<INTERNAL_12>",
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
|
|
2828 |
"rstrip": false,
|
2829 |
"normalized": false,
|
2830 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2831 |
}
|
2832 |
],
|
2833 |
"normalizer": null,
|
|
|
2857 |
"<EOS>": 5,
|
2858 |
"<MOLECULAR_ENTITY>": 6,
|
2859 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2860 |
+
"<INTERNAL_0>": 8,
|
2861 |
"<MOLECULAR_ENTITY_EPITOPE>": 9,
|
2862 |
+
"<INTERNAL_2>": 10,
|
2863 |
+
"<INTERNAL_3>": 11,
|
2864 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2865 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2866 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
|
|
3106 |
"<SENTINEL_ID_197>": 254,
|
3107 |
"<SENTINEL_ID_198>": 255,
|
3108 |
"<SENTINEL_ID_199>": 256,
|
3109 |
+
"<INTERNAL_17>": 257,
|
3110 |
+
"<INTERNAL_15>": 258,
|
3111 |
+
"<INTERNAL_16>": 259,
|
3112 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3113 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3114 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
|
|
3117 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3118 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3119 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3120 |
+
"<INTERNAL_7>": 268,
|
3121 |
+
"<INTERNAL_6>": 269,
|
3122 |
+
"<INTERNAL_9>": 270,
|
3123 |
+
"<INTERNAL_5>": 271,
|
3124 |
+
"<INTERNAL_8>": 272,
|
3125 |
+
"<INTERNAL_4>": 273,
|
3126 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3127 |
"<TIMESTEP>": 275,
|
3128 |
"<DIFFUSION>": 276,
|
|
|
3134 |
"<BACKSPACE>": 282,
|
3135 |
"<SEQUENCE_NATURAL_START>": 283,
|
3136 |
"<NOOP>": 284,
|
3137 |
+
"<INTERNAL_14>": 285,
|
3138 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3139 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3140 |
"<CELL_TYPE_CLASS>": 288,
|
|
|
3145 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3146 |
"<COMPLEX_ENTITY>": 294,
|
3147 |
"<ALTERNATIVE>": 295,
|
3148 |
+
"<INTERNAL_13>": 296,
|
3149 |
+
"<INTERNAL_12>": 297,
|
3150 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3151 |
"<MUTATED>": 299,
|
3152 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
|
|
3163 |
"<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
|
3164 |
"<AUTOENCODER_TASK>": 312,
|
3165 |
"<DECODED_FROM_LATENT>": 313,
|
|
|
|
|
|
|
3166 |
"[CL:0000499]": 3522,
|
3167 |
"[CL:2000060]": 3523,
|
3168 |
"[CL:0000235]": 3524,
|
tokenizer/gene_tokenizer.json
CHANGED
@@ -77,7 +77,7 @@
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
-
"content": "<
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
@@ -95,7 +95,7 @@
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
-
"content": "<
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
@@ -104,7 +104,7 @@
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
-
"content": "<
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
@@ -2318,7 +2318,7 @@
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
-
"content": "<
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
@@ -2327,7 +2327,7 @@
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
-
"content": "<
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
@@ -2336,7 +2336,7 @@
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
-
"content": "<
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
@@ -2417,7 +2417,7 @@
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
-
"content": "<
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
@@ -2426,7 +2426,7 @@
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
-
"content": "<
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
@@ -2435,7 +2435,7 @@
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
-
"content": "<
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
@@ -2444,7 +2444,7 @@
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
-
"content": "<
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
@@ -2453,7 +2453,7 @@
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
-
"content": "<
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
@@ -2462,7 +2462,7 @@
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
-
"content": "<
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
@@ -2570,7 +2570,7 @@
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
-
"content": "<
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
@@ -2669,7 +2669,7 @@
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
-
"content": "<
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
@@ -2678,7 +2678,7 @@
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
-
"content": "<
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
@@ -2828,33 +2828,6 @@
|
|
2828 |
"rstrip": false,
|
2829 |
"normalized": false,
|
2830 |
"special": true
|
2831 |
-
},
|
2832 |
-
{
|
2833 |
-
"id": 314,
|
2834 |
-
"content": "<BBBP>",
|
2835 |
-
"single_word": false,
|
2836 |
-
"lstrip": false,
|
2837 |
-
"rstrip": false,
|
2838 |
-
"normalized": false,
|
2839 |
-
"special": true
|
2840 |
-
},
|
2841 |
-
{
|
2842 |
-
"id": 315,
|
2843 |
-
"content": "<FDA_APPR>",
|
2844 |
-
"single_word": false,
|
2845 |
-
"lstrip": false,
|
2846 |
-
"rstrip": false,
|
2847 |
-
"normalized": false,
|
2848 |
-
"special": true
|
2849 |
-
},
|
2850 |
-
{
|
2851 |
-
"id": 316,
|
2852 |
-
"content": "<HIV_ACTIVITY>",
|
2853 |
-
"single_word": false,
|
2854 |
-
"lstrip": false,
|
2855 |
-
"rstrip": false,
|
2856 |
-
"normalized": false,
|
2857 |
-
"special": true
|
2858 |
}
|
2859 |
],
|
2860 |
"normalizer": null,
|
@@ -2884,10 +2857,10 @@
|
|
2884 |
"<EOS>": 5,
|
2885 |
"<MOLECULAR_ENTITY>": 6,
|
2886 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2887 |
-
"<
|
2888 |
"<MOLECULAR_ENTITY_EPITOPE>": 9,
|
2889 |
-
"<
|
2890 |
-
"<
|
2891 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2892 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2893 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
@@ -3133,9 +3106,9 @@
|
|
3133 |
"<SENTINEL_ID_197>": 254,
|
3134 |
"<SENTINEL_ID_198>": 255,
|
3135 |
"<SENTINEL_ID_199>": 256,
|
3136 |
-
"<
|
3137 |
-
"<
|
3138 |
-
"<
|
3139 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3140 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3141 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
@@ -3144,12 +3117,12 @@
|
|
3144 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3145 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3146 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3147 |
-
"<
|
3148 |
-
"<
|
3149 |
-
"<
|
3150 |
-
"<
|
3151 |
-
"<
|
3152 |
-
"<
|
3153 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3154 |
"<TIMESTEP>": 275,
|
3155 |
"<DIFFUSION>": 276,
|
@@ -3161,7 +3134,7 @@
|
|
3161 |
"<BACKSPACE>": 282,
|
3162 |
"<SEQUENCE_NATURAL_START>": 283,
|
3163 |
"<NOOP>": 284,
|
3164 |
-
"<
|
3165 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3166 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3167 |
"<CELL_TYPE_CLASS>": 288,
|
@@ -3172,8 +3145,8 @@
|
|
3172 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3173 |
"<COMPLEX_ENTITY>": 294,
|
3174 |
"<ALTERNATIVE>": 295,
|
3175 |
-
"<
|
3176 |
-
"<
|
3177 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3178 |
"<MUTATED>": 299,
|
3179 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
@@ -3190,9 +3163,6 @@
|
|
3190 |
"<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
|
3191 |
"<AUTOENCODER_TASK>": 312,
|
3192 |
"<DECODED_FROM_LATENT>": 313,
|
3193 |
-
"<BBBP>": 314,
|
3194 |
-
"<FDA_APPR>": 315,
|
3195 |
-
"<HIV_ACTIVITY>": 316,
|
3196 |
"[100130093]": 5000,
|
3197 |
"[100133445]": 5001,
|
3198 |
"[100286793]": 5002,
|
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
+
"content": "<INTERNAL_0>",
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
+
"content": "<INTERNAL_2>",
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
+
"content": "<INTERNAL_3>",
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
+
"content": "<INTERNAL_17>",
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
+
"content": "<INTERNAL_15>",
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
+
"content": "<INTERNAL_16>",
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
+
"content": "<INTERNAL_7>",
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
+
"content": "<INTERNAL_6>",
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
+
"content": "<INTERNAL_9>",
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
+
"content": "<INTERNAL_5>",
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
+
"content": "<INTERNAL_8>",
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
+
"content": "<INTERNAL_4>",
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
+
"content": "<INTERNAL_14>",
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
+
"content": "<INTERNAL_13>",
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
+
"content": "<INTERNAL_12>",
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
|
|
2828 |
"rstrip": false,
|
2829 |
"normalized": false,
|
2830 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2831 |
}
|
2832 |
],
|
2833 |
"normalizer": null,
|
|
|
2857 |
"<EOS>": 5,
|
2858 |
"<MOLECULAR_ENTITY>": 6,
|
2859 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2860 |
+
"<INTERNAL_0>": 8,
|
2861 |
"<MOLECULAR_ENTITY_EPITOPE>": 9,
|
2862 |
+
"<INTERNAL_2>": 10,
|
2863 |
+
"<INTERNAL_3>": 11,
|
2864 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2865 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2866 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
|
|
3106 |
"<SENTINEL_ID_197>": 254,
|
3107 |
"<SENTINEL_ID_198>": 255,
|
3108 |
"<SENTINEL_ID_199>": 256,
|
3109 |
+
"<INTERNAL_17>": 257,
|
3110 |
+
"<INTERNAL_15>": 258,
|
3111 |
+
"<INTERNAL_16>": 259,
|
3112 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3113 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3114 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
|
|
3117 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3118 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3119 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3120 |
+
"<INTERNAL_7>": 268,
|
3121 |
+
"<INTERNAL_6>": 269,
|
3122 |
+
"<INTERNAL_9>": 270,
|
3123 |
+
"<INTERNAL_5>": 271,
|
3124 |
+
"<INTERNAL_8>": 272,
|
3125 |
+
"<INTERNAL_4>": 273,
|
3126 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3127 |
"<TIMESTEP>": 275,
|
3128 |
"<DIFFUSION>": 276,
|
|
|
3134 |
"<BACKSPACE>": 282,
|
3135 |
"<SEQUENCE_NATURAL_START>": 283,
|
3136 |
"<NOOP>": 284,
|
3137 |
+
"<INTERNAL_14>": 285,
|
3138 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3139 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3140 |
"<CELL_TYPE_CLASS>": 288,
|
|
|
3145 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3146 |
"<COMPLEX_ENTITY>": 294,
|
3147 |
"<ALTERNATIVE>": 295,
|
3148 |
+
"<INTERNAL_13>": 296,
|
3149 |
+
"<INTERNAL_12>": 297,
|
3150 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3151 |
"<MUTATED>": 299,
|
3152 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
|
|
3163 |
"<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
|
3164 |
"<AUTOENCODER_TASK>": 312,
|
3165 |
"<DECODED_FROM_LATENT>": 313,
|
|
|
|
|
|
|
3166 |
"[100130093]": 5000,
|
3167 |
"[100133445]": 5001,
|
3168 |
"[100286793]": 5002,
|
tokenizer/t5_tokenizer_AA_special.json
CHANGED
@@ -77,7 +77,7 @@
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
-
"content": "<
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
@@ -95,7 +95,7 @@
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
-
"content": "<
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
@@ -104,7 +104,7 @@
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
-
"content": "<
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
@@ -2318,7 +2318,7 @@
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
-
"content": "<
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
@@ -2327,7 +2327,7 @@
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
-
"content": "<
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
@@ -2336,7 +2336,7 @@
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
-
"content": "<
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
@@ -2417,7 +2417,7 @@
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
-
"content": "<
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
@@ -2426,7 +2426,7 @@
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
-
"content": "<
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
@@ -2435,7 +2435,7 @@
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
-
"content": "<
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
@@ -2444,7 +2444,7 @@
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
-
"content": "<
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
@@ -2453,7 +2453,7 @@
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
-
"content": "<
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
@@ -2462,7 +2462,7 @@
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
-
"content": "<
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
@@ -2570,7 +2570,7 @@
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
-
"content": "<
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
@@ -2669,7 +2669,7 @@
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
-
"content": "<
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
@@ -2678,7 +2678,7 @@
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
-
"content": "<
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
@@ -2828,33 +2828,6 @@
|
|
2828 |
"rstrip": false,
|
2829 |
"normalized": false,
|
2830 |
"special": true
|
2831 |
-
},
|
2832 |
-
{
|
2833 |
-
"id": 314,
|
2834 |
-
"content": "<BBBP>",
|
2835 |
-
"single_word": false,
|
2836 |
-
"lstrip": false,
|
2837 |
-
"rstrip": false,
|
2838 |
-
"normalized": false,
|
2839 |
-
"special": true
|
2840 |
-
},
|
2841 |
-
{
|
2842 |
-
"id": 315,
|
2843 |
-
"content": "<FDA_APPR>",
|
2844 |
-
"single_word": false,
|
2845 |
-
"lstrip": false,
|
2846 |
-
"rstrip": false,
|
2847 |
-
"normalized": false,
|
2848 |
-
"special": true
|
2849 |
-
},
|
2850 |
-
{
|
2851 |
-
"id": 316,
|
2852 |
-
"content": "<HIV_ACTIVITY>",
|
2853 |
-
"single_word": false,
|
2854 |
-
"lstrip": false,
|
2855 |
-
"rstrip": false,
|
2856 |
-
"normalized": false,
|
2857 |
-
"special": true
|
2858 |
}
|
2859 |
],
|
2860 |
"normalizer": null,
|
@@ -2884,10 +2857,10 @@
|
|
2884 |
"<EOS>": 5,
|
2885 |
"<MOLECULAR_ENTITY>": 6,
|
2886 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2887 |
-
"<
|
2888 |
"<MOLECULAR_ENTITY_EPITOPE>": 9,
|
2889 |
-
"<
|
2890 |
-
"<
|
2891 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2892 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2893 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
@@ -3133,9 +3106,9 @@
|
|
3133 |
"<SENTINEL_ID_197>": 254,
|
3134 |
"<SENTINEL_ID_198>": 255,
|
3135 |
"<SENTINEL_ID_199>": 256,
|
3136 |
-
"<
|
3137 |
-
"<
|
3138 |
-
"<
|
3139 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3140 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3141 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
@@ -3144,12 +3117,12 @@
|
|
3144 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3145 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3146 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3147 |
-
"<
|
3148 |
-
"<
|
3149 |
-
"<
|
3150 |
-
"<
|
3151 |
-
"<
|
3152 |
-
"<
|
3153 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3154 |
"<TIMESTEP>": 275,
|
3155 |
"<DIFFUSION>": 276,
|
@@ -3161,7 +3134,7 @@
|
|
3161 |
"<BACKSPACE>": 282,
|
3162 |
"<SEQUENCE_NATURAL_START>": 283,
|
3163 |
"<NOOP>": 284,
|
3164 |
-
"<
|
3165 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3166 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3167 |
"<CELL_TYPE_CLASS>": 288,
|
@@ -3172,8 +3145,8 @@
|
|
3172 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3173 |
"<COMPLEX_ENTITY>": 294,
|
3174 |
"<ALTERNATIVE>": 295,
|
3175 |
-
"<
|
3176 |
-
"<
|
3177 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3178 |
"<MUTATED>": 299,
|
3179 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
@@ -3190,9 +3163,6 @@
|
|
3190 |
"<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
|
3191 |
"<AUTOENCODER_TASK>": 312,
|
3192 |
"<DECODED_FROM_LATENT>": 313,
|
3193 |
-
"<BBBP>": 314,
|
3194 |
-
"<FDA_APPR>": 315,
|
3195 |
-
"<HIV_ACTIVITY>": 316,
|
3196 |
"A": 501,
|
3197 |
"B": 502,
|
3198 |
"C": 503,
|
|
|
77 |
},
|
78 |
{
|
79 |
"id": 8,
|
80 |
+
"content": "<INTERNAL_0>",
|
81 |
"single_word": false,
|
82 |
"lstrip": false,
|
83 |
"rstrip": false,
|
|
|
95 |
},
|
96 |
{
|
97 |
"id": 10,
|
98 |
+
"content": "<INTERNAL_2>",
|
99 |
"single_word": false,
|
100 |
"lstrip": false,
|
101 |
"rstrip": false,
|
|
|
104 |
},
|
105 |
{
|
106 |
"id": 11,
|
107 |
+
"content": "<INTERNAL_3>",
|
108 |
"single_word": false,
|
109 |
"lstrip": false,
|
110 |
"rstrip": false,
|
|
|
2318 |
},
|
2319 |
{
|
2320 |
"id": 257,
|
2321 |
+
"content": "<INTERNAL_17>",
|
2322 |
"single_word": false,
|
2323 |
"lstrip": false,
|
2324 |
"rstrip": false,
|
|
|
2327 |
},
|
2328 |
{
|
2329 |
"id": 258,
|
2330 |
+
"content": "<INTERNAL_15>",
|
2331 |
"single_word": false,
|
2332 |
"lstrip": false,
|
2333 |
"rstrip": false,
|
|
|
2336 |
},
|
2337 |
{
|
2338 |
"id": 259,
|
2339 |
+
"content": "<INTERNAL_16>",
|
2340 |
"single_word": false,
|
2341 |
"lstrip": false,
|
2342 |
"rstrip": false,
|
|
|
2417 |
},
|
2418 |
{
|
2419 |
"id": 268,
|
2420 |
+
"content": "<INTERNAL_7>",
|
2421 |
"single_word": false,
|
2422 |
"lstrip": false,
|
2423 |
"rstrip": false,
|
|
|
2426 |
},
|
2427 |
{
|
2428 |
"id": 269,
|
2429 |
+
"content": "<INTERNAL_6>",
|
2430 |
"single_word": false,
|
2431 |
"lstrip": false,
|
2432 |
"rstrip": false,
|
|
|
2435 |
},
|
2436 |
{
|
2437 |
"id": 270,
|
2438 |
+
"content": "<INTERNAL_9>",
|
2439 |
"single_word": false,
|
2440 |
"lstrip": false,
|
2441 |
"rstrip": false,
|
|
|
2444 |
},
|
2445 |
{
|
2446 |
"id": 271,
|
2447 |
+
"content": "<INTERNAL_5>",
|
2448 |
"single_word": false,
|
2449 |
"lstrip": false,
|
2450 |
"rstrip": false,
|
|
|
2453 |
},
|
2454 |
{
|
2455 |
"id": 272,
|
2456 |
+
"content": "<INTERNAL_8>",
|
2457 |
"single_word": false,
|
2458 |
"lstrip": false,
|
2459 |
"rstrip": false,
|
|
|
2462 |
},
|
2463 |
{
|
2464 |
"id": 273,
|
2465 |
+
"content": "<INTERNAL_4>",
|
2466 |
"single_word": false,
|
2467 |
"lstrip": false,
|
2468 |
"rstrip": false,
|
|
|
2570 |
},
|
2571 |
{
|
2572 |
"id": 285,
|
2573 |
+
"content": "<INTERNAL_14>",
|
2574 |
"single_word": false,
|
2575 |
"lstrip": false,
|
2576 |
"rstrip": false,
|
|
|
2669 |
},
|
2670 |
{
|
2671 |
"id": 296,
|
2672 |
+
"content": "<INTERNAL_13>",
|
2673 |
"single_word": false,
|
2674 |
"lstrip": false,
|
2675 |
"rstrip": false,
|
|
|
2678 |
},
|
2679 |
{
|
2680 |
"id": 297,
|
2681 |
+
"content": "<INTERNAL_12>",
|
2682 |
"single_word": false,
|
2683 |
"lstrip": false,
|
2684 |
"rstrip": false,
|
|
|
2828 |
"rstrip": false,
|
2829 |
"normalized": false,
|
2830 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2831 |
}
|
2832 |
],
|
2833 |
"normalizer": null,
|
|
|
2857 |
"<EOS>": 5,
|
2858 |
"<MOLECULAR_ENTITY>": 6,
|
2859 |
"<GLOBAL_INTERACTION_ATTRIBUTES>": 7,
|
2860 |
+
"<INTERNAL_0>": 8,
|
2861 |
"<MOLECULAR_ENTITY_EPITOPE>": 9,
|
2862 |
+
"<INTERNAL_2>": 10,
|
2863 |
+
"<INTERNAL_3>": 11,
|
2864 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CHAIN>": 12,
|
2865 |
"<MOLECULAR_ENTITY_TCR_BETA_VDJ>": 13,
|
2866 |
"<MOLECULAR_ENTITY_TCR_BETA_CDR3>": 14,
|
|
|
3106 |
"<SENTINEL_ID_197>": 254,
|
3107 |
"<SENTINEL_ID_198>": 255,
|
3108 |
"<SENTINEL_ID_199>": 256,
|
3109 |
+
"<INTERNAL_17>": 257,
|
3110 |
+
"<INTERNAL_15>": 258,
|
3111 |
+
"<INTERNAL_16>": 259,
|
3112 |
"<ATTRIBUTE_ORGANISM>": 260,
|
3113 |
"<ATTRIBUTE_ORGANISM_HUMAN>": 261,
|
3114 |
"<ATTRIBUTE_ORGANISM_RABBIT>": 262,
|
|
|
3117 |
"<ATTRIBUTE_ORGANISM_MONKEY>": 265,
|
3118 |
"<ATTRIBUTE_ORGANISM_CAMEL>": 266,
|
3119 |
"<EPITOPE_PARATOPE_PREDICTION>": 267,
|
3120 |
+
"<INTERNAL_7>": 268,
|
3121 |
+
"<INTERNAL_6>": 269,
|
3122 |
+
"<INTERNAL_9>": 270,
|
3123 |
+
"<INTERNAL_5>": 271,
|
3124 |
+
"<INTERNAL_8>": 272,
|
3125 |
+
"<INTERNAL_4>": 273,
|
3126 |
"<MOLECULAR_ENTITY_GENERAL_PROTEIN>": 274,
|
3127 |
"<TIMESTEP>": 275,
|
3128 |
"<DIFFUSION>": 276,
|
|
|
3134 |
"<BACKSPACE>": 282,
|
3135 |
"<SEQUENCE_NATURAL_START>": 283,
|
3136 |
"<NOOP>": 284,
|
3137 |
+
"<INTERNAL_14>": 285,
|
3138 |
"<MOLECULAR_ENTITY_SMALL_MOLECULE>": 286,
|
3139 |
"<MOLECULAR_ENTITY_CELL_GENE_EXPRESSION_RANKED>": 287,
|
3140 |
"<CELL_TYPE_CLASS>": 288,
|
|
|
3145 |
"<MOLECULAR_ENTITY_PROTEIN_CHAIN>": 293,
|
3146 |
"<COMPLEX_ENTITY>": 294,
|
3147 |
"<ALTERNATIVE>": 295,
|
3148 |
+
"<INTERNAL_13>": 296,
|
3149 |
+
"<INTERNAL_12>": 297,
|
3150 |
"<SUBMOLECULAR_ENTITY>": 298,
|
3151 |
"<MUTATED>": 299,
|
3152 |
"<MOLECULAR_ENTITY_TCR_ALPHA_CDR3>": 300,
|
|
|
3163 |
"<AUTOENCODER_LATENT_SAMPLED_Z>": 311,
|
3164 |
"<AUTOENCODER_TASK>": 312,
|
3165 |
"<DECODED_FROM_LATENT>": 313,
|
|
|
|
|
|
|
3166 |
"A": 501,
|
3167 |
"B": 502,
|
3168 |
"C": 503,
|