jhn9803 committed on
Commit
c05a070
1 Parent(s): 5e43117

Upload tokenizer

Browse files
Files changed (4) hide show
  1. special_tokens_map.json +9 -0
  2. tokenizer.json +160 -0
  3. tokenizer_config.json +18 -0
  4. vocab.txt +5 -0
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "[UNK]"
9
+ }
tokenizer.json ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 512,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": null,
10
+ "added_tokens": [
11
+ {
12
+ "id": 0,
13
+ "content": "[CLS]",
14
+ "single_word": false,
15
+ "lstrip": false,
16
+ "rstrip": false,
17
+ "normalized": false,
18
+ "special": true
19
+ },
20
+ {
21
+ "id": 1,
22
+ "content": "[PAD]",
23
+ "single_word": false,
24
+ "lstrip": false,
25
+ "rstrip": false,
26
+ "normalized": false,
27
+ "special": true
28
+ },
29
+ {
30
+ "id": 2,
31
+ "content": "[SEP]",
32
+ "single_word": false,
33
+ "lstrip": false,
34
+ "rstrip": false,
35
+ "normalized": false,
36
+ "special": true
37
+ },
38
+ {
39
+ "id": 3,
40
+ "content": "[UNK]",
41
+ "single_word": false,
42
+ "lstrip": false,
43
+ "rstrip": false,
44
+ "normalized": false,
45
+ "special": true
46
+ },
47
+ {
48
+ "id": 4,
49
+ "content": "[MASK]",
50
+ "single_word": false,
51
+ "lstrip": false,
52
+ "rstrip": false,
53
+ "normalized": false,
54
+ "special": true
55
+ }
56
+ ],
57
+ "normalizer": {
58
+ "type": "BertNormalizer",
59
+ "clean_text": true,
60
+ "handle_chinese_chars": true,
61
+ "strip_accents": null,
62
+ "lowercase": false
63
+ },
64
+ "pre_tokenizer": {
65
+ "type": "BertPreTokenizer"
66
+ },
67
+ "post_processor": {
68
+ "type": "TemplateProcessing",
69
+ "single": [
70
+ {
71
+ "SpecialToken": {
72
+ "id": "[CLS]",
73
+ "type_id": 0
74
+ }
75
+ },
76
+ {
77
+ "Sequence": {
78
+ "id": "A",
79
+ "type_id": 0
80
+ }
81
+ },
82
+ {
83
+ "SpecialToken": {
84
+ "id": "[SEP]",
85
+ "type_id": 0
86
+ }
87
+ }
88
+ ],
89
+ "pair": [
90
+ {
91
+ "SpecialToken": {
92
+ "id": "[CLS]",
93
+ "type_id": 0
94
+ }
95
+ },
96
+ {
97
+ "Sequence": {
98
+ "id": "A",
99
+ "type_id": 0
100
+ }
101
+ },
102
+ {
103
+ "SpecialToken": {
104
+ "id": "[SEP]",
105
+ "type_id": 0
106
+ }
107
+ },
108
+ {
109
+ "Sequence": {
110
+ "id": "B",
111
+ "type_id": 0
112
+ }
113
+ },
114
+ {
115
+ "SpecialToken": {
116
+ "id": "[SEP]",
117
+ "type_id": 0
118
+ }
119
+ }
120
+ ],
121
+ "special_tokens": {
122
+ "[CLS]": {
123
+ "id": "[CLS]",
124
+ "ids": [
125
+ 0
126
+ ],
127
+ "tokens": [
128
+ "[CLS]"
129
+ ]
130
+ },
131
+ "[SEP]": {
132
+ "id": "[SEP]",
133
+ "ids": [
134
+ 2
135
+ ],
136
+ "tokens": [
137
+ "[SEP]"
138
+ ]
139
+ }
140
+ }
141
+ },
142
+ "decoder": {
143
+ "type": "WordPiece",
144
+ "prefix": "##",
145
+ "cleanup": true
146
+ },
147
+ "model": {
148
+ "type": "WordPiece",
149
+ "unk_token": "[UNK]",
150
+ "continuing_subword_prefix": "##",
151
+ "max_input_chars_per_word": 100,
152
+ "vocab": {
153
+ "[CLS]": 0,
154
+ "[PAD]": 1,
155
+ "[SEP]": 2,
156
+ "[UNK]": 3,
157
+ "[MASK]": 4
158
+ }
159
+ }
160
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "do_basic_tokenize": true,
5
+ "do_lower_case": false,
6
+ "eos_token": "[SEP]",
7
+ "mask_token": "[MASK]",
8
+ "model_max_length": 512,
9
+ "name_or_path": "jhn9803/roberta-large-law-contract-tokenizer",
10
+ "never_split": null,
11
+ "pad_token": "[PAD]",
12
+ "sep_token": "[SEP]",
13
+ "special_tokens_map_file": "/root/.cache/huggingface/hub/models--klue--roberta-large/snapshots/5193b95701189160c45d02a1033a4ea55bdbe259/special_tokens_map.json",
14
+ "strip_accents": null,
15
+ "tokenize_chinese_chars": true,
16
+ "tokenizer_class": "BertTokenizer",
17
+ "unk_token": "[UNK]"
18
+ }
vocab.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [CLS]
2
+ [PAD]
3
+ [SEP]
4
+ [UNK]
5
+ [MASK]