zpn committed
Commit 423f3bd
1 Parent(s): 0966411

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json +7 -0
  2. tokenizer.json +128 -0
  3. tokenizer_config.json +9 -0
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
@@ -0,0 +1,128 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "[UNK]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "[CLS]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "[SEP]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "[PAD]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 4,
+       "content": "[MASK]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": null,
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": "[CLS]",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "[SEP]",
+           "type_id": 0
+         }
+       }
+     ],
+     "pair": [
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "B",
+           "type_id": 1
+         }
+       }
+     ],
+     "special_tokens": {
+       "[CLS]": {
+         "id": "[CLS]",
+         "ids": [
+           1
+         ],
+         "tokens": [
+           "[CLS]"
+         ]
+       },
+       "[SEP]": {
+         "id": "[SEP]",
+         "ids": [
+           2
+         ],
+         "tokens": [
+           "[SEP]"
+         ]
+       }
+     }
+   },
+   "decoder": null,
+   "model": {
+     "type": "WordLevel",
+     "vocab": {
+       "[UNK]": 0,
+       "[CLS]": 1,
+       "[SEP]": 2,
+       "[PAD]": 3,
+       "[MASK]": 4,
+       "A": 5,
+       "C": 6,
+       "G": 7,
+       "N": 8,
+       "T": 9
+     },
+     "unk_token": "[UNK]"
+   }
+ }
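For reference, this tokenizer.json describes a character-level WordLevel model over the nucleotide vocabulary A, C, G, T and N, with a TemplateProcessing post-processor that wraps a single sequence as [CLS] ... [SEP]. Because both "normalizer" and "pre_tokenizer" are null, the raw string is never split automatically, so input has to be passed pre-split into characters. A minimal sketch with the tokenizers library, assuming the file above is saved locally as tokenizer.json:

from tokenizers import Tokenizer

# Load the serialized tokenizer from disk.
tok = Tokenizer.from_file("tokenizer.json")

# No pre-tokenizer is configured, so split the DNA string into single
# characters ourselves and pass it as a pre-tokenized sequence.
enc = tok.encode(list("ACGTN"), is_pretokenized=True)

print(enc.tokens)  # ['[CLS]', 'A', 'C', 'G', 'T', 'N', '[SEP]']
print(enc.ids)     # [1, 5, 6, 7, 9, 8, 2]

The post-processor contributes the leading [CLS] (id 1) and trailing [SEP] (id 2); the remaining ids come straight from the WordLevel vocab.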
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "model_max_length": 1000,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "[UNK]"
+ }
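tokenizer_config.json wires the files above into transformers: "tokenizer_class" selects PreTrainedTokenizerFast, the special-token names match special_tokens_map.json, and "model_max_length" caps inputs at 1000 tokens. A sketch of loading it, where "path/to/repo" is a placeholder for a local download of these three files (the repo id is not shown in this commit):

from transformers import PreTrainedTokenizerFast

# "path/to/repo" is hypothetical; point it at a directory containing
# tokenizer.json, tokenizer_config.json and special_tokens_map.json.
tokenizer = PreTrainedTokenizerFast.from_pretrained("path/to/repo")

print(tokenizer.model_max_length)                # 1000
print(tokenizer.cls_token, tokenizer.pad_token)  # [CLS] [PAD]

# As with the raw tokenizers API, sequences must be pre-split into
# characters; padding uses [PAD] (id 3).
batch = tokenizer(
    [list("ACGT"), list("GGNTA")],
    is_split_into_words=True,
    padding=True,
)
print(batch["input_ids"])
# [[1, 5, 6, 7, 9, 2, 3], [1, 7, 7, 8, 9, 5, 2]]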