| """Tests to ensure that the html.parser tree builder generates good | |
| trees.""" | |
| from pdb import set_trace | |
| import pickle | |
| import pytest | |
| import warnings | |
| from bs4.builder import ( | |
| HTMLParserTreeBuilder, | |
| ParserRejectedMarkup, | |
| XMLParsedAsHTMLWarning, | |
| ) | |
| from bs4.builder._htmlparser import BeautifulSoupHTMLParser | |
| from . import SoupTest, HTMLTreeBuilderSmokeTest | |


class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):

    default_builder = HTMLParserTreeBuilder

    def test_rejected_input(self):
        # Python's html.parser will occasionally reject markup,
        # especially when there is a problem with the initial DOCTYPE
        # declaration. Different versions of Python sound the alarm in
        # different ways, but Beautiful Soup consistently raises
        # errors as ParserRejectedMarkup exceptions.
        bad_markup = [
            # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
            # https://github.com/python/cpython/issues/81928
            b'\n<![\xff\xfe\xfe\xcd\x00',

            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
            # https://github.com/python/cpython/issues/78661
            #
            b'<![n\x00',
            b"<![UNKNOWN[]]>",
        ]
        for markup in bad_markup:
            with pytest.raises(ParserRejectedMarkup):
                soup = self.soup(markup)
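
        # Callers feeding untrusted bytes to Beautiful Soup will usually
        # want to catch this exception themselves; a minimal sketch (the
        # variable names and fallback are illustrative, not part of the
        # test suite):
        #
        #   try:
        #       soup = BeautifulSoup(data, "html.parser")
        #   except ParserRejectedMarkup:
        #       soup = None  # or retry with a different parser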

    def test_namespaced_system_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_namespaced_public_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_builder_is_pickled(self):
        """Unlike most tree builders, HTMLParserTreeBuilder is picklable
        and will be restored after pickling.
        """
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        assert isinstance(loaded.builder, type(tree.builder))
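
        # Most other builders are not restored, presumably because they
        # wrap parser state that does not survive pickling; this one
        # comes back usable. Illustrative application code (names are
        # hypothetical, not part of the test):
        #
        #   restored = pickle.loads(pickle.dumps(soup))
        #   restored.builder   # a fresh HTMLParserTreeBuilder instance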

    def test_redundant_empty_element_closing_tags(self):
        self.assert_soup('<br></br><br></br><br></br>', "<br/><br/><br/>")
        self.assert_soup('</br></br></br>', "")

    def test_empty_element(self):
        # This verifies that any buffered data present when the parser
        # finishes working is handled.
        self.assert_soup("foo &# bar", "foo &amp;# bar")
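
        # "&#" opens a numeric character reference that never completes,
        # so html.parser can still be holding it as buffered data when
        # the document ends; the builder has to flush it as ordinary
        # text, and the bare ampersand is escaped on the way back out.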

    def test_tracking_line_numbers(self):
        # The html.parser TreeBuilder keeps track of line number and
        # position of each element.
        markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
        soup = self.soup(markup)
        assert 2 == soup.p.sourceline
        assert 3 == soup.p.sourcepos
        assert "sourceline" == soup.p.find('sourceline').name

        # You can deactivate this behavior.
        soup = self.soup(markup, store_line_numbers=False)
        assert "sourceline" == soup.p.sourceline.name
        assert "sourcepos" == soup.p.sourcepos.name

    def test_on_duplicate_attribute(self):
        # The html.parser tree builder has a variety of ways of
        # handling a tag that contains the same attribute multiple times.
        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'

        # If you don't provide any particular value for
        # on_duplicate_attribute, later values replace earlier values.
        soup = self.soup(markup)
        assert "url3" == soup.a['href']
        assert ["cls"] == soup.a['class']
        assert "id" == soup.a['id']

        # You can also get this behavior explicitly.
        def assert_attribute(on_duplicate_attribute, expected):
            soup = self.soup(
                markup, on_duplicate_attribute=on_duplicate_attribute
            )
            assert expected == soup.a['href']

            # Verify that non-duplicate attributes are treated normally.
            assert ["cls"] == soup.a['class']
            assert "id" == soup.a['id']

        assert_attribute(None, "url3")
        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")

        # You can ignore subsequent values in favor of the first.
        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")

        # And you can pass in a callable that does whatever you want.
        def accumulate(attrs, key, value):
            if not isinstance(attrs[key], list):
                attrs[key] = [attrs[key]]
            attrs[key].append(value)

        assert_attribute(accumulate, ["url1", "url2", "url3"])
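
        # Outside the test helpers, the same hook can be passed directly
        # to the BeautifulSoup constructor; a minimal sketch (the markup
        # here is illustrative):
        #
        #   BeautifulSoup('<a href="a" href="b">', 'html.parser',
        #                 on_duplicate_attribute=accumulate)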

    def test_html5_attributes(self):
        # The html.parser TreeBuilder can convert any entity named in
        # the HTML5 spec to a sequence of Unicode characters, and
        # convert those Unicode characters to a (potentially
        # different) named entity on the way out.
        for input_element, output_unicode, output_element in (
            ("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
            ('&models;', '\u22a7', b'&models;'),
            ('&Nfr;', '\U0001d511', b'&Nfr;'),
            ('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
            ('&not;', '\xac', b'&not;'),
            ('&Not;', '\u2aec', b'&Not;'),
            ('&quot;', '"', b'"'),
            ('&there4;', '\u2234', b'&there4;'),
            ('&Therefore;', '\u2234', b'&there4;'),
            ('&therefore;', '\u2234', b'&there4;'),
            ("&fjlig;", 'fj', b'fj'),
            ("&sqcup;", '\u2294', b'&sqcup;'),
            ("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
            ("&apos;", "'", b"'"),
            ("&verbar;", "|", b"|"),
        ):
            markup = '<div>%s</div>' % input_element
            div = self.soup(markup).div

            without_element = div.encode()
            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
            assert without_element == expect

            with_element = div.encode(formatter="html")
            expect = b"<div>%s</div>" % output_element
            assert with_element == expect
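
        # The same substitution is available to ordinary callers; a
        # minimal sketch (the markup is illustrative):
        #
        #   BeautifulSoup('<p>&therefore;</p>', 'html.parser'
        #                 ).p.encode(formatter="html")
        #   # -> b'<p>&there4;</p>'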