|
"""Tests to ensure that the lxml tree builder generates good trees.""" |
|
|
|
import pickle |
|
import pytest |
|
import re |
|
import warnings |
|
from . import LXML_PRESENT, LXML_VERSION |
|
|
|
if LXML_PRESENT: |
|
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML |
|
|
|
from bs4 import ( |
|
BeautifulSoup, |
|
BeautifulStoneSoup, |
|
) |
|
from bs4.element import Comment, Doctype, SoupStrainer |
|
from . import ( |
|
HTMLTreeBuilderSmokeTest, |
|
XMLTreeBuilderSmokeTest, |
|
SOUP_SIEVE_PRESENT, |
|
SoupTest, |
|
) |
|
|
|
@pytest.mark.skipif(
    not LXML_PRESENT,
    reason="lxml seems not to be present, not testing its tree builder."
)
class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
    """Exercise lxml's HTML tree builder.

    Inherits the generic checks from ``HTMLTreeBuilderSmokeTest`` and adds
    (or overrides) the lxml-specific cases below. The whole class is
    skipped when lxml is not installed.
    """

    @property
    def default_builder(self):
        """Return the tree builder class these tests run against."""
        return LXMLTreeBuilder

    def test_out_of_range_entity(self):
        """Out-of-range numeric character references leave no output."""
        # Each reference (decimal, hexadecimal, decimal) names a code point
        # far beyond U+10FFFF, the largest valid Unicode code point. The
        # markup must contain the references themselves, not an
        # already-decoded replacement character, for this test to exercise
        # the entity handling.
        out_of_range_markup = (
            "<p>foo&#10000000000000;bar</p>",
            "<p>foo&#x10000000000000;bar</p>",
            "<p>foo&#1000000000;bar</p>",
        )
        for markup in out_of_range_markup:
            self.assert_soup(markup, "<p>foobar</p>")

    def test_entities_in_foreign_document_encoding(self):
        """Intentionally empty.

        NOTE(review): this presumably shadows a test of the same name
        inherited from ``HTMLTreeBuilderSmokeTest`` so that it does not
        run against lxml -- confirm against the smoke test definition.
        """
        pass

    @pytest.mark.skipif(
        not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
        reason="Skipping doctype test for old version of lxml to avoid segfault."
    )
    def test_empty_doctype(self):
        """A bare ``<!DOCTYPE>`` yields a first node that is blank once stripped."""
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        assert "" == doctype.strip()

    def test_beautifulstonesoup_is_xml_parser(self):
        """``BeautifulStoneSoup`` parses as XML and warns that it is deprecated."""
        # Only the constructor call needs to run while warnings are being
        # recorded; the recorded list stays usable after the block exits.
        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("<b />")

        # The self-closing tag is serialized XML-style.
        assert "<b/>" == str(soup.b)

        # Exactly one warning is expected; unpacking fails on any other count.
        [warning] = w
        # The warning must be attributed to this file (the caller).
        assert warning.filename == __file__
        assert "BeautifulStoneSoup class is deprecated" in str(warning.message)

    def test_tracking_line_numbers(self):
        """``.sourceline`` / ``.sourcepos`` resolve to child tags here.

        Even with ``store_line_numbers=True``, attribute access on ``<p>``
        finds the ``<sourceline>`` and ``<sourcepos>`` tags from the markup
        rather than anything else stored under those names.
        """
        soup = self.soup(
            "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
            store_line_numbers=True
        )
        assert "sourceline" == soup.p.sourceline.name
        assert "sourcepos" == soup.p.sourcepos.name
|
|
|
@pytest.mark.skipif(
    not LXML_PRESENT,
    reason="lxml seems not to be present, not testing its XML tree builder."
)
class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
    """Exercise lxml's XML tree builder.

    Inherits the generic checks from ``XMLTreeBuilderSmokeTest``; the
    methods below cover namespace bookkeeping and pickling. The whole
    class is skipped when lxml is not installed.
    """

    @property
    def default_builder(self):
        """Return the tree builder class these tests run against."""
        return LXMLTreeBuilderForXML

    def test_namespace_indexing(self):
        """Check which prefix-to-URI mappings each object records."""
        markup = (
            '<?xml version="1.1"?>\n'
            '<root>'
            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
            '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
            '<subtag xmlns="http://another-unprefixed-namespace.com">'
            '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
            '</prefix2:tag3>'
            '</root>'
        )
        soup = self.soup(markup)

        # Every expected mapping includes the built-in 'xml' prefix.
        xml_only = {'xml': 'http://www.w3.org/XML/1998/namespace'}

        # The document-level object is expected to know every *prefixed*
        # namespace declared anywhere in the markup; none of the
        # un-prefixed (default) namespace declarations appear.
        document_wide = dict(
            xml_only,
            prefix='http://prefixed-namespace.com',
            prefix2='http://another-namespace.com',
        )
        assert soup._namespaces == document_wide

        # <tag> only declares an un-prefixed namespace, so nothing beyond
        # 'xml' is expected on it.
        assert soup.tag._namespaces == xml_only

        # <prefix:tag2> additionally carries the prefix it declares itself.
        with_prefix = dict(xml_only, prefix='http://prefixed-namespace.com')
        assert soup.tag2._namespaces == with_prefix

        # Both tags nested under <prefix2:tag3> are expected to carry
        # 'prefix2' (declared on their ancestor) and not 'prefix'.
        with_prefix2 = dict(xml_only, prefix2='http://another-namespace.com')
        for nested in (soup.subtag, soup.subsubtag):
            assert nested._namespaces == with_prefix2

    @pytest.mark.skipif(
        not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed"
    )
    def test_namespace_interaction_with_select_and_find(self):
        """Show how namespace prefixes behave with ``select_one`` and ``find``."""
        markup = (
            '<?xml version="1.1"?>\n'
            '<root>'
            '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
            '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
            '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
            '<prefix:tag3>'
            '</subtag>'
            '</root>'
        )
        soup = self.soup(markup)

        # Selecting from the document object works for the un-prefixed tag
        # and for the first declaration of 'prefix'.
        plain_match = soup.select_one('tag')
        assert plain_match.name == 'tag'
        prefixed_match = soup.select_one('prefix|tag2')
        assert prefixed_match.name == 'tag2'

        # 'prefix' is declared a second time (with a different URI) on
        # <subtag>; from the document object that second binding is not
        # the one used, so tag3 is not found.
        assert soup.select_one('prefix|tag3') is None

        # Passing <subtag>'s own namespace mapping explicitly finds it.
        subtag_namespaces = soup.subtag._namespaces
        explicit_match = soup.select_one(
            'prefix|tag3', namespaces=subtag_namespaces
        )
        assert explicit_match.name == 'tag3'

        # Selecting from <subtag> itself also finds it without an explicit
        # mapping.
        scoped_match = soup.subtag.select_one('prefix|tag3')
        assert scoped_match.name == 'tag3'

        # find() with 'prefix:name' strings locates every tag, from either
        # starting point, regardless of which URI the prefix is bound to.
        assert soup.find('tag').name == 'tag'
        assert soup.find('prefix:tag2').name == 'tag2'
        assert soup.find('prefix:tag3').name == 'tag3'
        assert soup.subtag.find('prefix:tag3').name == 'tag3'

    def test_pickle_restores_builder(self):
        """A pickle round trip gives the copy its own builder of the same class."""
        original = self.soup("<a>some markup</a>")
        assert isinstance(original.builder, self.default_builder)

        restored = pickle.loads(pickle.dumps(original))

        # Content survives the round trip.
        assert "some markup" == restored.a.string
        # The restored builder is a different object of the expected class.
        assert restored.builder != original.builder
        assert isinstance(restored.builder, self.default_builder)
|
|