File size: 3,235 Bytes
787e5bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from typing import Optional, Union

import transformers
from bs4 import BeautifulSoup


class MarkupLMPhishProcessor(transformers.MarkupLMProcessor):
    """A ``MarkupLMProcessor`` that strips extraneous HTML before tokenizing.

    Real-world webpages are usually far larger than the 512-token attention
    window of BERT-style models.  Before handing ``html_strings`` to the stock
    MarkupLM processor, this subclass empties ``<style>``/``<script>`` bodies
    and unwraps every tag not in ``keep_tags_ctx``, so more of the page's
    meaningful structure fits into the model's context for the binary
    phishing-classification task.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Structural/content tags preserved during preprocessing; everything
        # else is unwrapped (tag removed, children kept) in _preprocess().
        self.keep_tags_ctx = [
            "html",
            "head",
            "body",
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "p",
            "a",
            "button",
            "span",
            "div",
            "iframe",
            "table",
        ]

    def _preprocess(self, html_string: str) -> str:
        """Return *html_string* reduced to the tags relevant for classification.

        Most webpages are huge. BERT's "attention" is limited to 512 tokens.
        In order to give the model more context to work with, we strip
        extraneous tags/content from the page to help with the binary
        classification task.
        """
        soup = BeautifulSoup(html_string, "html.parser")
        # find_all(True) snapshots every tag up front, so mutating the tree
        # inside the loop is safe.
        for tag in soup.find_all(True):
            if tag.name in ("style", "script"):
                # keep the meaning of the tag, but remove its contents to save space
                tag.string = ""
            elif tag.name not in self.keep_tags_ctx:
                # remove tag, but keep its contents
                tag.unwrap()
        return str(soup)

    def __call__(
        self,
        html_strings=None,
        nodes=None,
        xpaths=None,
        node_labels=None,
        questions=None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, transformers.utils.generic.PaddingStrategy] = False,
        truncation: Optional[
            Union[bool, str, transformers.tokenization_utils_base.TruncationStrategy]
        ] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[
            Union[str, transformers.utils.generic.TensorType]
        ] = None,
        **kwargs,
    ) -> transformers.tokenization_utils_base.BatchEncoding:
        """Tokenize inputs, first shrinking any ``html_strings`` via ``_preprocess``.

        Accepts the same arguments as ``transformers.MarkupLMProcessor.__call__``
        and returns its ``BatchEncoding`` unchanged.
        """
        # Custom html_strings preprocessing: shrink a single page or each page
        # in a sequence before the parent processor parses it.
        if html_strings is not None:
            if isinstance(html_strings, (list, tuple)):
                html_strings = [self._preprocess(hs) for hs in html_strings]
            elif isinstance(html_strings, str):
                html_strings = self._preprocess(html_strings)
        # Delegate by keyword so a signature change in a future transformers
        # release cannot silently misassign positional arguments.
        return super().__call__(
            html_strings=html_strings,
            nodes=nodes,
            xpaths=xpaths,
            node_labels=node_labels,
            questions=questions,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            return_tensors=return_tensors,
            **kwargs,
        )