pogzyb committed on
Commit
7c0cb84
1 Parent(s): da0a061

Add custom processor

Browse files

Add a custom processor to parse HTML.

Files changed (1) hide show
  1. tokenizer_config.json +3 -0
tokenizer_config.json CHANGED
@@ -68,12 +68,14 @@
68
  "errors": "replace",
69
  "mask_token": "<mask>",
70
  "max_depth": 50,
 
71
  "max_width": 1000,
72
  "model_max_length": 512,
73
  "only_label_first_subword": true,
74
  "pad_token": "<pad>",
75
  "pad_token_label": -100,
76
  "pad_width": 1001,
 
77
  "processor_class": "MarkupLMPhishProcessor",
78
  "sep_token": "</s>",
79
  "tags_dict": {
@@ -295,5 +297,6 @@
295
  },
296
  "tokenizer_class": "MarkupLMTokenizer",
297
  "trim_offsets": false,
 
298
  "unk_token": "<unk>"
299
  }
 
68
  "errors": "replace",
69
  "mask_token": "<mask>",
70
  "max_depth": 50,
71
+ "max_length": 512,
72
  "max_width": 1000,
73
  "model_max_length": 512,
74
  "only_label_first_subword": true,
75
  "pad_token": "<pad>",
76
  "pad_token_label": -100,
77
  "pad_width": 1001,
78
+ "padding": "max_length",
79
  "processor_class": "MarkupLMPhishProcessor",
80
  "sep_token": "</s>",
81
  "tags_dict": {
 
297
  },
298
  "tokenizer_class": "MarkupLMTokenizer",
299
  "trim_offsets": false,
300
+ "truncation": true,
301
  "unk_token": "<unk>"
302
  }