Add tokenizer config

Files changed (3)

README.md CHANGED

@@ -33,7 +33,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 input_sequences = ["EACU"*20, "EAUG"*20, "EAUG"*20, "EACU"*20, "EAUU"*20]
 helix_mrna_config = HelixmRNAConfig(batch_size=5, device=device, max_length=100)
-helix_mrna = HelixmRNA()
 # prepare data for input to the model
 processed_input_data = helix_mrna.process_data(input_sequences)
@@ -53,8 +53,8 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 input_sequences = ["EACU"*20, "EAUG"*20, "EAUG"*20, "EACU"*20, "EAUU"*20]
 labels = [0, 2, 2, 0, 1]
-helixr_config = HelixmRNAConfig(batch_size=5, device=device)
-helixr_fine_tune = HelixmRNAFineTuningModel(helix_mrna_config=helixr_config, output_size=3, max_length=100)
 train_dataset = helixr_fine_tune.process_data(input_sequences)

 input_sequences = ["EACU"*20, "EAUG"*20, "EAUG"*20, "EACU"*20, "EAUU"*20]
 helix_mrna_config = HelixmRNAConfig(batch_size=5, device=device, max_length=100)
+helix_mrna = HelixmRNA(configurer=helix_mrna_config)
 # prepare data for input to the model
 processed_input_data = helix_mrna.process_data(input_sequences)
 input_sequences = ["EACU"*20, "EAUG"*20, "EAUG"*20, "EACU"*20, "EAUU"*20]
 labels = [0, 2, 2, 0, 1]
+helixr_config = HelixmRNAConfig(batch_size=5, device=device, max_length=100)
+helixr_fine_tune = HelixmRNAFineTuningModel(helix_mrna_config=helixr_config, fine_tuning_head="classification", output_size=3)
 train_dataset = helixr_fine_tune.process_data(input_sequences)

special_tokens_map.json ADDED

+{
+    "bos_token": "[BOS]",
+    "cls_token": "[CLS]",
+    "eos_token": "[SEP]",
+    "mask_token": "[MASK]",
+    "pad_token": "[PAD]",
+    "sep_token": "[SEP]",
+    "unk_token": "[UNK]"
+  }

tokenizer_config.json ADDED

+{
+    "add_prefix_space": false,
+    "added_tokens_decoder": {
+      "0": {
+        "content": "[BOS]",
+        "lstrip": false,
+        "normalized": false,
+        "rstrip": false,
+        "single_word": false,
+        "special": true
+      },
+      "1": {
+        "content": "[PAD]",
+        "lstrip": false,
+        "normalized": false,
+        "rstrip": false,
+        "single_word": false,
+        "special": true
+      },
+      "2": {
+        "content": "[CLS]",
+        "lstrip": false,
+        "normalized": false,
+        "rstrip": false,
+        "single_word": false,
+        "special": true
+      },
+      "3": {
+        "content": "[MASK]",
+        "lstrip": false,
+        "normalized": false,
+        "rstrip": false,
+        "single_word": false,
+        "special": true
+      },
+      "6": {
+        "content": "[UNK]",
+        "lstrip": false,
+        "normalized": false,
+        "rstrip": false,
+        "single_word": false,
+        "special": true
+      }
+    },
+    "bos_token": "[BOS]",
+    "characters": [
+      "A",
+      "C",
+      "G",
+      "U",
+      "N",
+      "E",
+      "T"
+    ],
+    "clean_up_tokenization_spaces": false,
+    "cls_token": "[CLS]",
+    "eos_token": "[SEP]",
+    "mask_token": "[MASK]",
+    "model_max_length": 12288,
+    "pad_token": "[PAD]",
+    "padding_side": "left",
+    "sep_token": "[SEP]",
+    "tokenizer_class": "CharTokenizer",
+    "unk_token": "[UNK]"
+  }