| """Tests to ensure that the lxml tree builder generates good trees.""" |
|
|
| import pickle |
| import pytest |
| import re |
| import warnings |
| from . import LXML_PRESENT, LXML_VERSION |
|
|
| if LXML_PRESENT: |
| from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML |
|
|
| from bs4 import ( |
| BeautifulSoup, |
| BeautifulStoneSoup, |
| ) |
| from bs4.element import Comment, Doctype, SoupStrainer |
| from . import ( |
| HTMLTreeBuilderSmokeTest, |
| XMLTreeBuilderSmokeTest, |
| SOUP_SIEVE_PRESENT, |
| SoupTest, |
| ) |
|
|
@pytest.mark.skipif(
    not LXML_PRESENT,
    reason="lxml seems not to be present, not testing its tree builder."
)
class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
    """Run the HTML smoke tests against ``LXMLTreeBuilder``.

    See ``HTMLTreeBuilderSmokeTest`` for the inherited tests. The methods
    defined here add lxml-specific cases.
    """

    @property
    def default_builder(self):
        """Return the tree builder class under test."""
        return LXMLTreeBuilder

    def test_out_of_range_entity(self):
        """Out-of-range numeric character references vanish from the output."""
        # The markup must contain the references themselves (decimal and
        # hexadecimal forms); a literal U+FFFD character would bypass entity
        # handling entirely and would not be removed from the output.
        # NOTE(review): the exact digits were reconstructed after the
        # references had been flattened to U+FFFD -- confirm against upstream.
        out_of_range_references = (
            "&#10000000000000;",
            "&#x10000000000000;",
            "&#1000000000;",
        )
        for reference in out_of_range_references:
            self.assert_soup(
                "<p>foo" + reference + "bar</p>", "<p>foobar</p>")

    def test_entities_in_foreign_document_encoding(self):
        """Intentionally a no-op for this builder."""
        # NOTE(review): presumably this overrides an inherited smoke test
        # that this builder cannot satisfy -- confirm against the base class.
        pass

    # The version guard exists because, per the skip reason below, an empty
    # doctype can segfault old lxml releases.
    @pytest.mark.skipif(
        not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
        reason="Skipping doctype test for old version of lxml to avoid segfault."
    )
    def test_empty_doctype(self):
        """An empty ``<!DOCTYPE>`` yields a first node whose text is blank."""
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        assert "" == doctype.strip()

    def test_beautifulstonesoup_is_xml_parser(self):
        """``BeautifulStoneSoup`` serializes as XML and warns once on use."""
        with warnings.catch_warnings(record=True) as caught:
            soup = BeautifulStoneSoup("<b />")

        # An empty-element tag rendered as "<b/>" shows XML serialization.
        assert "<b/>" == str(soup.b)

        # Exactly one warning is expected; the unpacking fails otherwise.
        (deprecation,) = caught
        # The warning must be attributed to this file, i.e. to the caller.
        assert deprecation.filename == __file__
        assert "BeautifulStoneSoup class is deprecated" in str(deprecation.message)

    def test_tracking_line_numbers(self):
        """``sourceline``/``sourcepos`` attribute access reaches child tags.

        Even with ``store_line_numbers=True``, the assertions expect
        ``soup.p.sourceline`` and ``soup.p.sourcepos`` to be the tags of
        those names rather than numeric positions.
        """
        soup = self.soup(
            "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
            store_line_numbers=True
        )
        assert "sourceline" == soup.p.sourceline.name
        assert "sourcepos" == soup.p.sourcepos.name
| |
@pytest.mark.skipif(
    not LXML_PRESENT,
    reason="lxml seems not to be present, not testing its XML tree builder."
)
class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
    """Run the XML smoke tests against ``LXMLTreeBuilderForXML``.

    See ``XMLTreeBuilderSmokeTest`` for the inherited tests.
    """

    @property
    def default_builder(self):
        """Return the tree builder class under test."""
        return LXMLTreeBuilderForXML

    def test_namespace_indexing(self):
        """Check the ``_namespaces`` map on the soup and on individual tags."""
        # NOTE(review): <subtag> and <subsubtag> are never closed in this
        # fixture; the assertions depend on how the parser handles that, so
        # the markup is kept exactly as it was.
        markup = (
            '<?xml version="1.1"?>\n'
            '<root>'
            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
            '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
            '<subtag xmlns="http://another-unprefixed-namespace.com">'
            '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
            '</prefix2:tag3>'
            '</root>'
        )
        soup = self.soup(markup)

        xml_uri = 'http://www.w3.org/XML/1998/namespace'
        prefixed_uri = 'http://prefixed-namespace.com'
        another_uri = 'http://another-namespace.com'

        # Document level: the 'xml' prefix plus every explicitly prefixed
        # declaration. None of the unprefixed (default) namespace URIs is
        # expected in any of the maps checked here.
        assert soup._namespaces == {
            'xml': xml_uri,
            'prefix': prefixed_uri,
            'prefix2': another_uri,
        }

        # A tag that only declares a default namespace sees just 'xml'.
        assert soup.tag._namespaces == {'xml': xml_uri}

        # A tag declaring its own prefix sees that prefix as well.
        assert soup.tag2._namespaces == {
            'prefix': prefixed_uri,
            'xml': xml_uri,
        }

        # Tags nested under <prefix2:tag3> see the 'prefix2' binding at
        # every depth checked.
        inherited = {
            'prefix2': another_uri,
            'xml': xml_uri,
        }
        assert soup.subtag._namespaces == inherited
        assert soup.subsubtag._namespaces == inherited

    @pytest.mark.skipif(
        not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed"
    )
    def test_namespace_interaction_with_select_and_find(self):
        """Check prefix handling in ``select_one()`` versus ``find()``.

        The same prefix is bound to two different URIs at different points
        in the document.
        """
        # NOTE(review): <prefix:tag2> is closed with </tag> and <prefix:tag3>
        # is never closed; the assertions depend on how the parser handles
        # this markup, so it is kept exactly as it was.
        markup = (
            '<?xml version="1.1"?>\n'
            '<root>'
            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
            '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
            '<prefix:tag3>'
            '</subtag>'
            '</root>'
        )
        soup = self.soup(markup)

        # Selecting from the document root finds the unprefixed tag and the
        # tag using the first binding of 'prefix'.
        unprefixed_match = soup.select_one('tag')
        assert unprefixed_match.name == 'tag'
        first_binding_match = soup.select_one('prefix|tag2')
        assert first_binding_match.name == 'tag2'

        # From the root, the tag under the second binding of 'prefix' is
        # not found.
        assert soup.select_one('prefix|tag3') is None

        # Passing the namespace map of <subtag> explicitly makes it findable.
        explicit_match = soup.select_one(
            'prefix|tag3', namespaces=soup.subtag._namespaces
        )
        assert explicit_match.name == 'tag3'

        # Selecting from <subtag> itself also finds it, with no explicit map.
        scoped_match = soup.subtag.select_one('prefix|tag3')
        assert scoped_match.name == 'tag3'

        # find() is given the literal 'prefix:name' string and locates every
        # tag, from the root as well as from <subtag>.
        assert soup.find('tag').name == 'tag'
        assert soup.find('prefix:tag2').name == 'tag2'
        assert soup.find('prefix:tag3').name == 'tag3'
        assert soup.subtag.find('prefix:tag3').name == 'tag3'

    def test_pickle_restores_builder(self):
        """A pickle round trip keeps the content and rebuilds the builder."""
        soup = self.soup("<a>some markup</a>")
        assert isinstance(soup.builder, self.default_builder)

        # Round-trip through pickle; the payload is produced right here, so
        # loading it is safe.
        restored = pickle.loads(pickle.dumps(soup))

        assert "some markup" == restored.a.string
        # The restored soup has a builder of the same class, but it does not
        # compare equal to the original builder object.
        assert restored.builder != soup.builder
        assert isinstance(restored.builder, self.default_builder)
|
|