File size: 6,972 Bytes
6370773
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""This file contains test cases reported by third parties using
fuzzing tools, primarily from Google's oss-fuzz project. Some of these
represent real problems with Beautiful Soup, but many are problems in
libraries that Beautiful Soup depends on, and many of the test cases
represent different ways of triggering the same problem.

Grouping these test cases together makes it easy to see which test
cases represent the same problem, and puts the test cases in close
proximity to code that can trigger the problems.
"""
import os
import pytest
from bs4 import (
    BeautifulSoup,
    ParserRejectedMarkup,
)
try:
    from soupsieve.util import SelectorSyntaxError
    import lxml
    import html5lib
    fully_fuzzable = True
except ImportError:
    fully_fuzzable = False
    

@pytest.mark.skipif(not fully_fuzzable, reason="Prerequisites for fuzz tests are not installed.")
class TestFuzz(object):
    """Regression tests built from markup files reported by fuzzers.

    Every test loads a test case from the ``fuzz`` directory that sits
    next to this module and runs it through Beautiful Soup. The whole
    class is skipped when ``fully_fuzzable`` is false, i.e. when one
    of the guarded imports at the top of this module failed.
    """

    # Test case markup files from fuzzers are given this extension so
    # they can be included in builds.
    TESTCASE_SUFFIX = ".testcase"

    # Copied 20230512 from
    # https://github.com/google/oss-fuzz/blob/4ac6a645a197a695fe76532251feb5067076b3f3/projects/bs4/bs4_fuzzer.py
    #
    # Copying the code lets us precisely duplicate the behavior of
    # oss-fuzz.  The downside is that this code changes over time, so
    # multiple copies of the code must be kept around to run against
    # older tests. I'm not sure what to do about this, but I may
    # retire old tests after a time.
    def fuzz_test_with_css(self, filename):
        """Run one fuzzer test case that encodes a parser choice and a
        CSS selector in its leading bytes.

        The first byte of the file picks the parser, bytes 1-9 are
        used as the CSS selector, and the remainder is parsed as
        markup. The method returns quietly if the parser rejects the
        markup or the selector is not valid; any other exception
        propagates and fails the calling test.

        :param filename: Name of the test case file, with or without
            ``TESTCASE_SUFFIX``.
        """
        data = self.__markup(filename)
        parsers = ['lxml-xml', 'html5lib', 'html.parser', 'lxml']
        # NOTE: `data` is a bytestring, so data[0] is already an int
        # and int() won't raise ValueError here. The handler is kept
        # so this method stays a faithful copy of the upstream fuzzer.
        try:
            idx = int(data[0]) % len(parsers)
        except ValueError:
            return

        css_selector, data = data[1:10], data[10:]

        # The upstream fuzzer skips one more byte of the remaining
        # data before parsing it; that is preserved here.
        try:
            soup = BeautifulSoup(data[1:], features=parsers[idx])
        except ParserRejectedMarkup:
            return
        except ValueError:
            return

        # Exercise tree traversal, CSS selection and string conversion.
        list(soup.find_all(True))
        try:
            soup.css.select(css_selector.decode('utf-8', 'replace'))
        except SelectorSyntaxError:
            return
        soup.prettify()

    # This class of error has been fixed by catching a less helpful
    # exception from html.parser and raising ParserRejectedMarkup
    # instead.
    @pytest.mark.parametrize(
        "filename", [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5703933063462912",
            "crash-ffbdfa8a2b26f13537b68d3794b0478a4090ee4a",
        ]
    )
    def test_rejected_markup(self, filename):
        """Parsing this markup with html.parser must raise
        ParserRejectedMarkup.
        """
        markup = self.__markup(filename)
        with pytest.raises(ParserRejectedMarkup):
            BeautifulSoup(markup, 'html.parser')

    # This class of error has to do with very deeply nested documents
    # which overflow the Python call stack when the tree is converted
    # to a string. This is an issue with Beautiful Soup which was fixed
    # as part of [bug=1471755].
    #
    # These test cases are in the older format that doesn't specify
    # which parser to use or give a CSS selector.
    @pytest.mark.parametrize(
        "filename", [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5984173902397440",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5167584867909632",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6124268085182464",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6450958476902400",
        ]
    )
    def test_deeply_nested_document_without_css(self, filename):
        """Parse an old-format test case with html.parser and encode
        the result.
        """
        # Parsing the document and encoding it back to a string is
        # sufficient to demonstrate that the overflow problem has
        # been fixed.
        markup = self.__markup(filename)
        BeautifulSoup(markup, 'html.parser').encode()

    # This class of error has to do with very deeply nested documents
    # which overflow the Python call stack when the tree is converted
    # to a string. This is an issue with Beautiful Soup which was fixed
    # as part of [bug=1471755].
    @pytest.mark.parametrize(
        "filename", [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5000587759190016",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5375146639360000",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5492400320282624",
        ]
    )
    def test_deeply_nested_document(self, filename):
        """Run a deeply nested test case through fuzz_test_with_css."""
        self.fuzz_test_with_css(filename)

    @pytest.mark.parametrize(
        "filename", [
            "clusterfuzz-testcase-minimized-bs4_fuzzer-4670634698080256",
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5270998950477824",
        ]
    )
    def test_soupsieve_errors(self, filename):
        """Run a test case through fuzz_test_with_css."""
        self.fuzz_test_with_css(filename)

    # This class of error represents problems with html5lib's parser,
    # not Beautiful Soup. I use
    # https://github.com/html5lib/html5lib-python/issues/568 to notify
    # the html5lib developers of these issues.
    #
    # These test cases are in the older format that doesn't specify
    # which parser to use or give a CSS selector.
    @pytest.mark.skip(reason="html5lib-specific problems")
    @pytest.mark.parametrize(
        "filename", [
            # b"""ÿ<!DOCTyPEV PUBLIC'''Ð'"""
            "clusterfuzz-testcase-minimized-bs4_fuzzer-4818336571064320",

            # b')<a><math><TR><a><mI><a><p><a>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-4999465949331456",

            # b'-<math><sElect><mi><sElect><sElect>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-5843991618256896",

            # b'ñ<table><svg><html>'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6241471367348224",

            # <TABLE>, some ^@ characters, some <math> tags.
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6600557255327744",

            # Nested table
            "crash-0d306a50c8ed8bcd0785b67000fcd5dea1d33f08"
        ]
    )
    def test_html5lib_parse_errors_without_css(self, filename):
        """Parse an old-format test case with html5lib and print the
        encoded result. Currently skipped.
        """
        markup = self.__markup(filename)
        print(BeautifulSoup(markup, 'html5lib').encode())

    # This class of error represents problems with html5lib's parser,
    # not Beautiful Soup. I use
    # https://github.com/html5lib/html5lib-python/issues/568 to notify
    # the html5lib developers of these issues.
    @pytest.mark.skip(reason="html5lib-specific problems")
    @pytest.mark.parametrize(
        "filename", [
            # b'-      \xff\xff  <math>\x10<select><mi><select><select>t'
            "clusterfuzz-testcase-minimized-bs4_fuzzer-6306874195312640",
        ]
    )
    def test_html5lib_parse_errors(self, filename):
        """Run a test case through fuzz_test_with_css. Currently
        skipped.
        """
        self.fuzz_test_with_css(filename)

    def __markup(self, filename):
        """Load the contents of a fuzzer test case.

        :param filename: Name of a file in the ``fuzz`` directory next
            to this module. ``TESTCASE_SUFFIX`` is appended if the
            name doesn't already end with it.
        :return: The contents of the file, as a bytestring.
        """
        if not filename.endswith(self.TESTCASE_SUFFIX):
            filename += self.TESTCASE_SUFFIX
        this_dir = os.path.split(__file__)[0]
        path = os.path.join(this_dir, 'fuzz', filename)
        # Use a context manager so the file handle is closed as soon
        # as the data is read, instead of whenever the file object
        # happens to be garbage-collected.
        with open(path, 'rb') as testcase:
            return testcase.read()