ming030890 committed on
Commit
b7ddfa3
1 Parent(s): 6156925

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +7 -1
  2. tokenizer.json +75 -9
  3. tokenizer_config.json +16 -0
special_tokens_map.json CHANGED
@@ -15,7 +15,13 @@
15
  "rstrip": false,
16
  "single_word": false
17
  },
18
- "sep_token": "</s>",
 
 
 
 
 
 
19
  "unk_token": {
20
  "content": "<unk>",
21
  "lstrip": false,
 
15
  "rstrip": false,
16
  "single_word": false
17
  },
18
+ "sep_token": {
19
+ "content": "</s>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
  "unk_token": {
26
  "content": "<unk>",
27
  "lstrip": false,
tokenizer.json CHANGED
@@ -1,6 +1,11 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
 
 
 
 
 
4
  "padding": null,
5
  "added_tokens": [
6
  {
@@ -12,6 +17,15 @@
12
  "normalized": false,
13
  "special": true
14
  },
 
 
 
 
 
 
 
 
 
15
  {
16
  "id": 2,
17
  "content": "</s>",
@@ -21,6 +35,15 @@
21
  "normalized": false,
22
  "special": true
23
  },
 
 
 
 
 
 
 
 
 
24
  {
25
  "id": 15999,
26
  "content": "<pad>",
@@ -59,15 +82,58 @@
59
  "split": true
60
  },
61
  "post_processor": {
62
- "type": "BertProcessing",
63
- "sep": [
64
- "<s>",
65
- 1
 
 
 
 
 
 
 
 
 
 
66
  ],
67
- "cls": [
68
- "</s>",
69
- 2
70
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  },
72
  "decoder": {
73
  "type": "Metaspace",
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 128,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
  "padding": null,
10
  "added_tokens": [
11
  {
 
17
  "normalized": false,
18
  "special": true
19
  },
20
+ {
21
+ "id": 1,
22
+ "content": "<s>",
23
+ "single_word": false,
24
+ "lstrip": false,
25
+ "rstrip": false,
26
+ "normalized": false,
27
+ "special": true
28
+ },
29
  {
30
  "id": 2,
31
  "content": "</s>",
 
35
  "normalized": false,
36
  "special": true
37
  },
38
+ {
39
+ "id": 3,
40
+ "content": "<mask>",
41
+ "single_word": false,
42
+ "lstrip": false,
43
+ "rstrip": false,
44
+ "normalized": false,
45
+ "special": true
46
+ },
47
  {
48
  "id": 15999,
49
  "content": "<pad>",
 
82
  "split": true
83
  },
84
  "post_processor": {
85
+ "type": "TemplateProcessing",
86
+ "single": [
87
+ {
88
+ "Sequence": {
89
+ "id": "A",
90
+ "type_id": 0
91
+ }
92
+ },
93
+ {
94
+ "SpecialToken": {
95
+ "id": "</s>",
96
+ "type_id": 0
97
+ }
98
+ }
99
  ],
100
+ "pair": [
101
+ {
102
+ "Sequence": {
103
+ "id": "A",
104
+ "type_id": 0
105
+ }
106
+ },
107
+ {
108
+ "SpecialToken": {
109
+ "id": "</s>",
110
+ "type_id": 0
111
+ }
112
+ },
113
+ {
114
+ "Sequence": {
115
+ "id": "B",
116
+ "type_id": 0
117
+ }
118
+ },
119
+ {
120
+ "SpecialToken": {
121
+ "id": "</s>",
122
+ "type_id": 0
123
+ }
124
+ }
125
+ ],
126
+ "special_tokens": {
127
+ "</s>": {
128
+ "id": "</s>",
129
+ "ids": [
130
+ 2
131
+ ],
132
+ "tokens": [
133
+ "</s>"
134
+ ]
135
+ }
136
+ }
137
  },
138
  "decoder": {
139
  "type": "Metaspace",
tokenizer_config.json CHANGED
@@ -9,6 +9,14 @@
9
  "single_word": false,
10
  "special": true
11
  },
 
 
 
 
 
 
 
 
12
  "2": {
13
  "content": "</s>",
14
  "lstrip": false,
@@ -17,6 +25,14 @@
17
  "single_word": false,
18
  "special": true
19
  },
 
 
 
 
 
 
 
 
20
  "15999": {
21
  "content": "<pad>",
22
  "lstrip": false,
 
9
  "single_word": false,
10
  "special": true
11
  },
12
+ "1": {
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
  "2": {
21
  "content": "</s>",
22
  "lstrip": false,
 
25
  "single_word": false,
26
  "special": true
27
  },
28
+ "3": {
29
+ "content": "<mask>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
  "15999": {
37
  "content": "<pad>",
38
  "lstrip": false,