mattwoodx committed on
Commit faecb10
1 Parent(s): c8a2f3d

Add tokenizer config

Files changed (3)
  1. README.md +3 -3
  2. special_tokens_map.json +9 -0
  3. tokenizer_config.json +65 -0
README.md CHANGED
@@ -33,7 +33,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
  input_sequences = ["EACU"*20, "EAUG"*20, "EAUG"*20, "EACU"*20, "EAUU"*20]

  helix_mrna_config = HelixmRNAConfig(batch_size=5, device=device, max_length=100)
- helix_mrna = HelixmRNA()
+ helix_mrna = HelixmRNA(configurer=helix_mrna_config)

  # prepare data for input to the model
  processed_input_data = helix_mrna.process_data(input_sequences)
@@ -53,8 +53,8 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
  input_sequences = ["EACU"*20, "EAUG"*20, "EAUG"*20, "EACU"*20, "EAUU"*20]
  labels = [0, 2, 2, 0, 1]

- helixr_config = HelixmRNAConfig(batch_size=5, device=device)
- helixr_fine_tune = HelixmRNAFineTuningModel(helix_mrna_config=helixr_config, output_size=3, max_length=100)
+ helixr_config = HelixmRNAConfig(batch_size=5, device=device, max_length=100)
+ helixr_fine_tune = HelixmRNAFineTuningModel(helix_mrna_config=helixr_config, fine_tuning_head="classification", output_size=3)

  train_dataset = helixr_fine_tune.process_data(input_sequences)

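Read as one piece, the post-change README snippet looks like the sketch below, assembled from the "+" lines above. The torch and helical import lines are assumptions: they sit outside this diff, so the exact import path may differ from what the README actually uses.

```python
import torch
from helical import HelixmRNA, HelixmRNAConfig, HelixmRNAFineTuningModel  # import path assumed

device = "cuda" if torch.cuda.is_available() else "cpu"
input_sequences = ["EACU"*20, "EAUG"*20, "EAUG"*20, "EACU"*20, "EAUU"*20]
labels = [0, 2, 2, 0, 1]

# The model is now constructed from its config rather than with no arguments.
helix_mrna_config = HelixmRNAConfig(batch_size=5, device=device, max_length=100)
helix_mrna = HelixmRNA(configurer=helix_mrna_config)

# prepare data for input to the model
processed_input_data = helix_mrna.process_data(input_sequences)

# Fine-tuning: max_length now lives in the config, and the head is named explicitly.
helixr_config = HelixmRNAConfig(batch_size=5, device=device, max_length=100)
helixr_fine_tune = HelixmRNAFineTuningModel(
    helix_mrna_config=helixr_config,
    fine_tuning_head="classification",
    output_size=3,
)
train_dataset = helixr_fine_tune.process_data(input_sequences)
```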
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token": "[BOS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
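A quick way to inspect what the new map declares, using only the standard library (a local sanity script, not part of the repo). It makes visible that eos_token and sep_token both resolve to the same "[SEP]" token.

```python
import json

# Load the special-tokens map added by this commit.
with open("special_tokens_map.json") as f:
    special_map = json.load(f)

# Print each role alongside the token it maps to.
for role, token in sorted(special_map.items()):
    print(f"{role:12s} -> {token}")
# Note: eos_token and sep_token both print "[SEP]".
```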
tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "[BOS]",
+   "characters": [
+     "A",
+     "C",
+     "G",
+     "U",
+     "N",
+     "E",
+     "T"
+   ],
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "model_max_length": 12288,
+   "pad_token": "[PAD]",
+   "padding_side": "left",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "CharTokenizer",
+   "unk_token": "[UNK]"
+ }
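The config pins IDs only for [BOS], [PAD], [CLS], [MASK], and [UNK]; note the gap at IDs 4-5 (where [SEP] presumably lives) and [UNK] at 6. The IDs of the seven sequence characters are not recorded here, so the sketch below is a minimal character-level encoder consistent with this file under an explicitly assumed character layout; the repo's actual CharTokenizer may assign IDs differently.

```python
import json

with open("tokenizer_config.json") as f:
    cfg = json.load(f)

# Special-token IDs come straight from added_tokens_decoder.
vocab = {entry["content"]: int(idx)
         for idx, entry in cfg["added_tokens_decoder"].items()}

# ASSUMPTION: the sequence characters occupy the IDs after the highest
# pinned special token; the real CharTokenizer may lay them out differently.
next_id = max(vocab.values()) + 1
for ch in cfg["characters"]:
    vocab[ch] = next_id
    next_id += 1

unk_id = vocab[cfg["unk_token"]]

def encode(sequence: str) -> list[int]:
    """Map each character to its ID, falling back to [UNK]."""
    return [vocab.get(ch, unk_id) for ch in sequence]

print(encode("EACU"))  # [12, 7, 8, 10] under the assumed layout
```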