Add custom processor
Browse files
Add a custom processor to parse HTML.
- tokenizer_config.json +3 -0
tokenizer_config.json
CHANGED
@@ -68,12 +68,14 @@
|
|
68 |
"errors": "replace",
|
69 |
"mask_token": "<mask>",
|
70 |
"max_depth": 50,
|
|
|
71 |
"max_width": 1000,
|
72 |
"model_max_length": 512,
|
73 |
"only_label_first_subword": true,
|
74 |
"pad_token": "<pad>",
|
75 |
"pad_token_label": -100,
|
76 |
"pad_width": 1001,
|
|
|
77 |
"processor_class": "MarkupLMPhishProcessor",
|
78 |
"sep_token": "</s>",
|
79 |
"tags_dict": {
|
@@ -295,5 +297,6 @@
|
|
295 |
},
|
296 |
"tokenizer_class": "MarkupLMTokenizer",
|
297 |
"trim_offsets": false,
|
|
|
298 |
"unk_token": "<unk>"
|
299 |
}
|
|
|
68 |
"errors": "replace",
|
69 |
"mask_token": "<mask>",
|
70 |
"max_depth": 50,
|
71 |
+
"max_length": 512,
|
72 |
"max_width": 1000,
|
73 |
"model_max_length": 512,
|
74 |
"only_label_first_subword": true,
|
75 |
"pad_token": "<pad>",
|
76 |
"pad_token_label": -100,
|
77 |
"pad_width": 1001,
|
78 |
+
"padding": "max_length",
|
79 |
"processor_class": "MarkupLMPhishProcessor",
|
80 |
"sep_token": "</s>",
|
81 |
"tags_dict": {
|
|
|
297 |
},
|
298 |
"tokenizer_class": "MarkupLMTokenizer",
|
299 |
"trim_offsets": false,
|
300 |
+
"truncation": true,
|
301 |
"unk_token": "<unk>"
|
302 |
}
|