pogzyb committed on
Commit
787e5bf
1 Parent(s): 6418c7b

Add custom processor

Browse files

Add a custom processor to parse HTML.

added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<end-of-node>": 50266,
3
+ "[empty-title]": 50265
4
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processor.MarkupLMPhishProcessor"
4
+ },
5
+ "feature_extractor_type": "MarkupLMFeatureExtractor",
6
+ "processor_class": "MarkupLMPhishProcessor"
7
+ }
processor.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ import transformers
4
+ from bs4 import BeautifulSoup
5
+
6
+
7
+ class MarkupLMPhishProcessor(transformers.MarkupLMProcessor):
8
+ def __init__(self, *args, **kwargs):
9
+ super().__init__(*args, **kwargs)
10
+ self.keep_tags_ctx = [
11
+ "html",
12
+ "head",
13
+ "body",
14
+ "h1",
15
+ "h2",
16
+ "h3",
17
+ "h4",
18
+ "h5",
19
+ "h6",
20
+ "p",
21
+ "a",
22
+ "button",
23
+ "span",
24
+ "div",
25
+ "iframe",
26
+ "table",
27
+ ]
28
+
29
+ def _preprocess(self, html_string: str):
30
+ # Most webpages are huge. BERT's "attention" is limited to 512 tokens.
31
+ # In order to give the model more context to work with, we strip extraneous
32
+ # tags/content from the page to help with the binary classification task.
33
+ soup = BeautifulSoup(html_string, "html.parser")
34
+ for tag in soup.find_all(True):
35
+ if tag.name in ("style", "script"):
36
+ # keep the meaning of the tag, but remove its contents to save space
37
+ tag.string = ""
38
+ elif tag.name not in self.keep_tags_ctx:
39
+ # remove tag, but keep its contents
40
+ tag.unwrap()
41
+ return str(soup)
42
+
43
+ def __call__(
44
+ self,
45
+ html_strings=None,
46
+ nodes=None,
47
+ xpaths=None,
48
+ node_labels=None,
49
+ questions=None,
50
+ add_special_tokens: bool = True,
51
+ padding: Union[bool, str, transformers.utils.generic.PaddingStrategy] = False,
52
+ truncation: Union[
53
+ bool, str, transformers.tokenization_utils_base.TruncationStrategy
54
+ ] = None,
55
+ max_length: Optional[int] = None,
56
+ stride: int = 0,
57
+ pad_to_multiple_of: Optional[int] = None,
58
+ return_token_type_ids: Optional[bool] = None,
59
+ return_attention_mask: Optional[bool] = None,
60
+ return_overflowing_tokens: bool = False,
61
+ return_special_tokens_mask: bool = False,
62
+ return_offsets_mapping: bool = False,
63
+ return_length: bool = False,
64
+ verbose: bool = True,
65
+ return_tensors: Union[str, transformers.utils.generic.TensorType] = None,
66
+ **kwargs,
67
+ ) -> transformers.tokenization_utils_base.BatchEncoding:
68
+ # custom html_strings preprocessing
69
+ if html_strings is not None:
70
+ if isinstance(html_strings, list):
71
+ html_strings = [self._preprocess(hs) for hs in html_strings]
72
+ elif isinstance(html_strings, str):
73
+ html_strings = self._preprocess(html_strings)
74
+ # invoke the parent method
75
+ return super().__call__(
76
+ html_strings,
77
+ nodes,
78
+ xpaths,
79
+ node_labels,
80
+ questions,
81
+ add_special_tokens,
82
+ padding,
83
+ truncation,
84
+ max_length,
85
+ stride,
86
+ pad_to_multiple_of,
87
+ return_token_type_ids,
88
+ return_attention_mask,
89
+ return_overflowing_tokens,
90
+ return_special_tokens_mask,
91
+ return_offsets_mapping,
92
+ return_length,
93
+ verbose,
94
+ return_tensors,
95
+ **kwargs,
96
+ )
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processor.MarkupLMPhishProcessor"
4
+ },
5
+ "processor_class": "MarkupLMPhishProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "50265": {
45
+ "content": "[empty-title]",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": false
51
+ },
52
+ "50266": {
53
+ "content": "<end-of-node>",
54
+ "lstrip": false,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": false
59
+ }
60
+ },
61
+ "auto_map": {
62
+ "AutoProcessor": "processor.MarkupLMPhishProcessor"
63
+ },
64
+ "bos_token": "<s>",
65
+ "clean_up_tokenization_spaces": true,
66
+ "cls_token": "<s>",
67
+ "eos_token": "</s>",
68
+ "errors": "replace",
69
+ "mask_token": "<mask>",
70
+ "max_depth": 50,
71
+ "max_width": 1000,
72
+ "model_max_length": 512,
73
+ "only_label_first_subword": true,
74
+ "pad_token": "<pad>",
75
+ "pad_token_label": -100,
76
+ "pad_width": 1001,
77
+ "processor_class": "MarkupLMPhishProcessor",
78
+ "sep_token": "</s>",
79
+ "tags_dict": {
80
+ "a": 0,
81
+ "abbr": 1,
82
+ "acronym": 2,
83
+ "address": 3,
84
+ "altGlyph": 4,
85
+ "altGlyphDef": 5,
86
+ "altGlyphItem": 6,
87
+ "animate": 7,
88
+ "animateColor": 8,
89
+ "animateMotion": 9,
90
+ "animateTransform": 10,
91
+ "applet": 11,
92
+ "area": 12,
93
+ "article": 13,
94
+ "aside": 14,
95
+ "audio": 15,
96
+ "b": 16,
97
+ "base": 17,
98
+ "basefont": 18,
99
+ "bdi": 19,
100
+ "bdo": 20,
101
+ "bgsound": 21,
102
+ "big": 22,
103
+ "blink": 23,
104
+ "blockquote": 24,
105
+ "body": 25,
106
+ "br": 26,
107
+ "button": 27,
108
+ "canvas": 28,
109
+ "caption": 29,
110
+ "center": 30,
111
+ "circle": 31,
112
+ "cite": 32,
113
+ "clipPath": 33,
114
+ "code": 34,
115
+ "col": 35,
116
+ "colgroup": 36,
117
+ "color-profile": 37,
118
+ "content": 38,
119
+ "cursor": 39,
120
+ "data": 40,
121
+ "datalist": 41,
122
+ "dd": 42,
123
+ "defs": 43,
124
+ "del": 44,
125
+ "desc": 45,
126
+ "details": 46,
127
+ "dfn": 47,
128
+ "dialog": 48,
129
+ "dir": 49,
130
+ "div": 50,
131
+ "dl": 51,
132
+ "dt": 52,
133
+ "ellipse": 53,
134
+ "em": 54,
135
+ "embed": 55,
136
+ "feBlend": 56,
137
+ "feColorMatrix": 57,
138
+ "feComponentTransfer": 58,
139
+ "feComposite": 59,
140
+ "feConvolveMatrix": 60,
141
+ "feDiffuseLighting": 61,
142
+ "feDisplacementMap": 62,
143
+ "feDistantLight": 63,
144
+ "feFlood": 64,
145
+ "feFuncA": 65,
146
+ "feFuncB": 66,
147
+ "feFuncG": 67,
148
+ "feFuncR": 68,
149
+ "feGaussianBlur": 69,
150
+ "feImage": 70,
151
+ "feMerge": 71,
152
+ "feMergeNode": 72,
153
+ "feMorphology": 73,
154
+ "feOffset": 74,
155
+ "fePointLight": 75,
156
+ "feSpecularLighting": 76,
157
+ "feSpotLight": 77,
158
+ "feTile": 78,
159
+ "feTurbulence": 79,
160
+ "fieldset": 80,
161
+ "figcaption": 81,
162
+ "figure": 82,
163
+ "filter": 83,
164
+ "font": 89,
165
+ "font-face": 88,
166
+ "font-face-format": 84,
167
+ "font-face-name": 85,
168
+ "font-face-src": 86,
169
+ "font-face-uri": 87,
170
+ "footer": 90,
171
+ "foreignObject": 91,
172
+ "form": 92,
173
+ "frame": 93,
174
+ "frameset": 94,
175
+ "g": 95,
176
+ "glyph": 96,
177
+ "glyphRef": 97,
178
+ "h1": 98,
179
+ "h2": 99,
180
+ "h3": 100,
181
+ "h4": 101,
182
+ "h5": 102,
183
+ "h6": 103,
184
+ "head": 104,
185
+ "header": 105,
186
+ "hgroup": 106,
187
+ "hkern": 107,
188
+ "hr": 108,
189
+ "html": 109,
190
+ "i": 110,
191
+ "iframe": 111,
192
+ "image": 112,
193
+ "img": 113,
194
+ "input": 114,
195
+ "ins": 115,
196
+ "kbd": 116,
197
+ "keygen": 117,
198
+ "label": 118,
199
+ "legend": 119,
200
+ "li": 120,
201
+ "line": 121,
202
+ "linearGradient": 122,
203
+ "link": 123,
204
+ "main": 124,
205
+ "map": 125,
206
+ "mark": 126,
207
+ "marker": 127,
208
+ "marquee": 128,
209
+ "mask": 129,
210
+ "math": 130,
211
+ "menu": 131,
212
+ "menuitem": 132,
213
+ "meta": 133,
214
+ "metadata": 134,
215
+ "meter": 135,
216
+ "missing-glyph": 136,
217
+ "mpath": 137,
218
+ "nav": 138,
219
+ "nobr": 139,
220
+ "noembed": 140,
221
+ "noframes": 141,
222
+ "noscript": 142,
223
+ "object": 143,
224
+ "ol": 144,
225
+ "optgroup": 145,
226
+ "option": 146,
227
+ "output": 147,
228
+ "p": 148,
229
+ "param": 149,
230
+ "path": 150,
231
+ "pattern": 151,
232
+ "picture": 152,
233
+ "plaintext": 153,
234
+ "polygon": 154,
235
+ "polyline": 155,
236
+ "portal": 156,
237
+ "pre": 157,
238
+ "progress": 158,
239
+ "q": 159,
240
+ "radialGradient": 160,
241
+ "rb": 161,
242
+ "rect": 162,
243
+ "rp": 163,
244
+ "rt": 164,
245
+ "rtc": 165,
246
+ "ruby": 166,
247
+ "s": 167,
248
+ "samp": 168,
249
+ "script": 169,
250
+ "section": 170,
251
+ "select": 171,
252
+ "set": 172,
253
+ "shadow": 173,
254
+ "slot": 174,
255
+ "small": 175,
256
+ "source": 176,
257
+ "spacer": 177,
258
+ "span": 178,
259
+ "stop": 179,
260
+ "strike": 180,
261
+ "strong": 181,
262
+ "style": 182,
263
+ "sub": 183,
264
+ "summary": 184,
265
+ "sup": 185,
266
+ "svg": 186,
267
+ "switch": 187,
268
+ "symbol": 188,
269
+ "table": 189,
270
+ "tbody": 190,
271
+ "td": 191,
272
+ "template": 192,
273
+ "text": 193,
274
+ "textPath": 194,
275
+ "textarea": 195,
276
+ "tfoot": 196,
277
+ "th": 197,
278
+ "thead": 198,
279
+ "time": 199,
280
+ "title": 200,
281
+ "tr": 201,
282
+ "track": 202,
283
+ "tref": 203,
284
+ "tspan": 204,
285
+ "tt": 205,
286
+ "u": 206,
287
+ "ul": 207,
288
+ "use": 208,
289
+ "var": 209,
290
+ "video": 210,
291
+ "view": 211,
292
+ "vkern": 212,
293
+ "wbr": 213,
294
+ "xmp": 214
295
+ },
296
+ "tokenizer_class": "MarkupLMTokenizer",
297
+ "trim_offsets": false,
298
+ "unk_token": "<unk>"
299
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff