adibvafa
/

CodonTransformer

+{
+  "bos_token": "[CLS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,250 @@

+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "[UNK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "[CLS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "[SEP]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "[PAD]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 4,
+      "content": "[MASK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": {
+    "type": "Sequence",
+    "normalizers": [
+      {
+        "type": "Lowercase"
+      }
+    ]
+  },
+  "pre_tokenizer": {
+    "type": "Sequence",
+    "pretokenizers": [
+      {
+        "type": "Split",
+        "pattern": {
+          "String": " "
+        },
+        "behavior": "Isolated",
+        "invert": false
+      },
+      {
+        "type": "Whitespace"
+      }
+    ]
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "[CLS]",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "SpecialToken": {
+          "id": "[CLS]",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "[CLS]": {
+        "id": "[CLS]",
+        "ids": [
+          1
+        ],
+        "tokens": [
+          "[CLS]"
+        ]
+      },
+      "[SEP]": {
+        "id": "[SEP]",
+        "ids": [
+          2
+        ],
+        "tokens": [
+          "[SEP]"
+        ]
+      }
+    }
+  },
+  "decoder": null,
+  "model": {
+    "type": "WordPiece",
+    "unk_token": "[UNK]",
+    "continuing_subword_prefix": "##",
+    "max_input_chars_per_word": 100,
+    "vocab": {
+      "[UNK]": 0,
+      "[CLS]": 1,
+      "[SEP]": 2,
+      "[PAD]": 3,
+      "[MASK]": 4,
+      "a_unk": 5,
+      "c_unk": 6,
+      "d_unk": 7,
+      "e_unk": 8,
+      "f_unk": 9,
+      "g_unk": 10,
+      "h_unk": 11,
+      "i_unk": 12,
+      "k_unk": 13,
+      "l_unk": 14,
+      "m_unk": 15,
+      "n_unk": 16,
+      "p_unk": 17,
+      "q_unk": 18,
+      "r_unk": 19,
+      "s_unk": 20,
+      "t_unk": 21,
+      "v_unk": 22,
+      "w_unk": 23,
+      "y_unk": 24,
+      "__unk": 25,
+      "k_aaa": 26,
+      "n_aac": 27,
+      "k_aag": 28,
+      "n_aat": 29,
+      "t_aca": 30,
+      "t_acc": 31,
+      "t_acg": 32,
+      "t_act": 33,
+      "r_aga": 34,
+      "s_agc": 35,
+      "r_agg": 36,
+      "s_agt": 37,
+      "i_ata": 38,
+      "i_atc": 39,
+      "m_atg": 40,
+      "i_att": 41,
+      "q_caa": 42,
+      "h_cac": 43,
+      "q_cag": 44,
+      "h_cat": 45,
+      "p_cca": 46,
+      "p_ccc": 47,
+      "p_ccg": 48,
+      "p_cct": 49,
+      "r_cga": 50,
+      "r_cgc": 51,
+      "r_cgg": 52,
+      "r_cgt": 53,
+      "l_cta": 54,
+      "l_ctc": 55,
+      "l_ctg": 56,
+      "l_ctt": 57,
+      "e_gaa": 58,
+      "d_gac": 59,
+      "e_gag": 60,
+      "d_gat": 61,
+      "a_gca": 62,
+      "a_gcc": 63,
+      "a_gcg": 64,
+      "a_gct": 65,
+      "g_gga": 66,
+      "g_ggc": 67,
+      "g_ggg": 68,
+      "g_ggt": 69,
+      "v_gta": 70,
+      "v_gtc": 71,
+      "v_gtg": 72,
+      "v_gtt": 73,
+      "__taa": 74,
+      "y_tac": 75,
+      "__tag": 76,
+      "y_tat": 77,
+      "s_tca": 78,
+      "s_tcc": 79,
+      "s_tcg": 80,
+      "s_tct": 81,
+      "__tga": 82,
+      "c_tgc": 83,
+      "w_tgg": 84,
+      "c_tgt": 85,
+      "l_tta": 86,
+      "f_ttc": 87,
+      "l_ttg": 88,
+      "f_ttt": 89
+    }
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "[UNK]"
+}