Spaces:
Paused
Paused
| """This file contains test cases reported by third parties using | |
| fuzzing tools, primarily from Google's oss-fuzz project. Some of these | |
| represent real problems with Beautiful Soup, but many are problems in | |
| libraries that Beautiful Soup depends on, and many of the test cases | |
| represent different ways of triggering the same problem. | |
| Grouping these test cases together makes it easy to see which test | |
| cases represent the same problem, and puts the test cases in close | |
| proximity to code that can trigger the problems. | |
| """ | |
| import os | |
| import pytest | |
| from bs4 import ( | |
| BeautifulSoup, | |
| ParserRejectedMarkup, | |
| ) | |
| try: | |
| from soupsieve.util import SelectorSyntaxError | |
| import lxml | |
| import html5lib | |
| fully_fuzzable = True | |
| except ImportError: | |
| fully_fuzzable = False | |
| class TestFuzz(object): | |
| # Test case markup files from fuzzers are given this extension so | |
| # they can be included in builds. | |
| TESTCASE_SUFFIX = ".testcase" | |
| # Copied 20230512 from | |
| # https://github.com/google/oss-fuzz/blob/4ac6a645a197a695fe76532251feb5067076b3f3/projects/bs4/bs4_fuzzer.py | |
| # | |
| # Copying the code lets us precisely duplicate the behavior of | |
| # oss-fuzz. The downside is that this code changes over time, so | |
| # multiple copies of the code must be kept around to run against | |
| # older tests. I'm not sure what to do about this, but I may | |
| # retire old tests after a time. | |
| def fuzz_test_with_css(self, filename): | |
| data = self.__markup(filename) | |
| parsers = ['lxml-xml', 'html5lib', 'html.parser', 'lxml'] | |
| try: | |
| idx = int(data[0]) % len(parsers) | |
| except ValueError: | |
| return | |
| css_selector, data = data[1:10], data[10:] | |
| try: | |
| soup = BeautifulSoup(data[1:], features=parsers[idx]) | |
| except ParserRejectedMarkup: | |
| return | |
| except ValueError: | |
| return | |
| list(soup.find_all(True)) | |
| try: | |
| soup.css.select(css_selector.decode('utf-8', 'replace')) | |
| except SelectorSyntaxError: | |
| return | |
| soup.prettify() | |
| # This class of error has been fixed by catching a less helpful | |
| # exception from html.parser and raising ParserRejectedMarkup | |
| # instead. | |
| def test_rejected_markup(self, filename): | |
| markup = self.__markup(filename) | |
| with pytest.raises(ParserRejectedMarkup): | |
| BeautifulSoup(markup, 'html.parser') | |
| # This class of error has to do with very deeply nested documents | |
| # which overflow the Python call stack when the tree is converted | |
| # to a string. This is an issue with Beautiful Soup which was fixed | |
| # as part of [bug=1471755]. | |
| # | |
| # These test cases are in the older format that doesn't specify | |
| # which parser to use or give a CSS selector. | |
| def test_deeply_nested_document_without_css(self, filename): | |
| # Parsing the document and encoding it back to a string is | |
| # sufficient to demonstrate that the overflow problem has | |
| # been fixed. | |
| markup = self.__markup(filename) | |
| BeautifulSoup(markup, 'html.parser').encode() | |
| # This class of error has to do with very deeply nested documents | |
| # which overflow the Python call stack when the tree is converted | |
| # to a string. This is an issue with Beautiful Soup which was fixed | |
| # as part of [bug=1471755]. | |
| def test_deeply_nested_document(self, filename): | |
| self.fuzz_test_with_css(filename) | |
| def test_soupsieve_errors(self, filename): | |
| self.fuzz_test_with_css(filename) | |
| # This class of error represents problems with html5lib's parser, | |
| # not Beautiful Soup. I use | |
| # https://github.com/html5lib/html5lib-python/issues/568 to notify | |
| # the html5lib developers of these issues. | |
| # | |
| # These test cases are in the older format that doesn't specify | |
| # which parser to use or give a CSS selector. | |
| def test_html5lib_parse_errors_without_css(self, filename): | |
| markup = self.__markup(filename) | |
| print(BeautifulSoup(markup, 'html5lib').encode()) | |
| # This class of error represents problems with html5lib's parser, | |
| # not Beautiful Soup. I use | |
| # https://github.com/html5lib/html5lib-python/issues/568 to notify | |
| # the html5lib developers of these issues. | |
| def test_html5lib_parse_errors(self, filename): | |
| self.fuzz_test_with_css(filename) | |
| def __markup(self, filename): | |
| if not filename.endswith(self.TESTCASE_SUFFIX): | |
| filename += self.TESTCASE_SUFFIX | |
| this_dir = os.path.split(__file__)[0] | |
| path = os.path.join(this_dir, 'fuzz', filename) | |
| return open(path, 'rb').read() | |