|
|
|
import pytest |
|
import logging |
|
import bs4 |
|
from bs4 import BeautifulSoup |
|
from bs4.dammit import ( |
|
EntitySubstitution, |
|
EncodingDetector, |
|
UnicodeDammit, |
|
) |
|
|
|
class TestUnicodeDammit(object):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        """Markup that is already a str comes back unchanged."""
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        assert dammit.unicode_markup == markup

    @pytest.mark.parametrize(
        "smart_quotes_to,expect_converted",
        [
            # No target given: the four smart quotes become the
            # corresponding Unicode characters.
            (None, "\u2018\u2019\u201c\u201d"),
            # "xml" yields numeric character references.
            ("xml", "&#x2018;&#x2019;&#x201C;&#x201D;"),
            # "html" yields named HTML entities.
            ("html", "&lsquo;&rsquo;&ldquo;&rdquo;"),
            # "ascii" yields plain ASCII quote characters.
            ("ascii", "''" + '""'),
        ],
    )
    def test_smart_quotes_to(self, smart_quotes_to, expect_converted):
        """Verify the functionality of the smart_quotes_to argument
        to the UnicodeDammit constructor."""
        # \x91-\x94 are the Windows-1252 single and double smart quotes.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        converted = UnicodeDammit(
            markup,
            known_definite_encodings=["windows-1252"],
            smart_quotes_to=smart_quotes_to,
        ).unicode_markup
        assert converted == "<foo>{}</foo>".format(expect_converted)

    def test_detect_utf8(self):
        """UTF-8 bytes are detected as UTF-8 and decoded correctly."""
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        assert dammit.original_encoding.lower() == 'utf-8'
        assert dammit.unicode_markup == 'Sacr\xe9 bleu! \N{SNOWMAN}'

    def test_convert_hebrew(self):
        """A suggested ISO-8859-8 encoding is used to decode Hebrew bytes."""
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        assert dammit.original_encoding.lower() == 'iso-8859-8'
        assert dammit.unicode_markup == '\u05dd\u05d5\u05dc\u05e9'

    def test_dont_see_smart_quotes_where_there_are_none(self):
        """Multi-byte UTF-8 data round-trips without being altered."""
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        assert dammit.original_encoding.lower() == 'utf-8'
        # Re-encoding must reproduce the input byte for byte.
        assert dammit.unicode_markup.encode("utf-8") == utf_8

    def test_ignore_inappropriate_codecs(self):
        """A suggested codec that does not fit the data is passed over."""
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        assert dammit.original_encoding.lower() == 'utf-8'

    def test_ignore_invalid_codecs(self):
        """Suggested codec names that are not real codecs are passed over."""
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            assert dammit.original_encoding.lower() == 'utf-8'

    def test_exclude_encodings(self):
        """Encodings listed in exclude_encodings are never chosen."""
        # This data is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # With UTF-8 excluded, the data is reported as windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        assert dammit.original_encoding.lower() == 'windows-1252'

        # With windows-1252 excluded as well, no encoding is reported.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        assert dammit.original_encoding is None
|
|
|
class TestEncodingDetector(object):
    """Tests of encoding detection via EncodingDetector and UnicodeDammit."""

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        """An undecodable byte in a declared encoding name becomes U+FFFD."""
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):
        """<meta charset> is honored however the attribute value is quoted."""
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>",
        ):
            dammit = UnicodeDammit(data, is_html=True)
            assert "euc-jp" == dammit.original_encoding

    def test_last_ditch_entity_replacement(self):
        """Undecodable bytes end up as U+FFFD and are flagged on the result."""
        # The document starts with a UTF-8 byte-order mark and declares
        # itself UTF-8, but the bytes inside <i> are not valid UTF-8
        # (\310 is a lead byte and \322 is not a continuation byte).
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        # Silence warnings for the duration of the test; restored below.
        logging.disable(logging.WARNING)
        try:
            def noop(_markup):
                # Stand-in for the chardet hook that offers no guess.
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            assert dammit.contains_replacement_characters is True
            assert "\ufffd" in dammit.unicode_markup

            # The flag is also exposed on a BeautifulSoup object built
            # from the same bytes.
            soup = BeautifulSoup(doc, "html.parser")
            assert soup.contains_replacement_characters
        finally:
            # Undo both global changes even if an assertion failed.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        """A UTF-16LE byte-order mark is dropped from the converted markup."""
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        assert "<a>áé</a>" == dammit.unicode_markup
        assert "utf-16le" == dammit.original_encoding

    def test_known_definite_versus_user_encodings(self):
        """known_definite_encodings and user_encodings are tried at
        different points relative to the byte-order mark."""
        # This document starts with the UTF-16LE byte-order mark.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        baseline = UnicodeDammit(data)

        # Passed as a known definite encoding, "utf-16" is the encoding
        # that gets reported.
        before = UnicodeDammit(data, known_definite_encodings=["utf-16"])
        assert "utf-16" == before.original_encoding

        # Passed as a user encoding, "utf-8" is never tried: the
        # encoding indicated by the byte-order mark wins.
        after = UnicodeDammit(data, user_encodings=["utf-8"])
        assert "utf-16le" == after.original_encoding
        assert ["utf-16le"] == [x[0] for x in after.tried_encodings]
        # With no hints at all, the same single encoding is tried.
        assert ["utf-16le"] == [x[0] for x in baseline.tried_encodings]

        # Here the known definite encoding is tried first and does not
        # work out, so the user encoding is tried next and reported.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, known_definite_encodings=["utf-8"],
                               user_encodings=["iso-8859-8"])
        assert "iso-8859-8" == dammit.original_encoding
        assert ["utf-8", "iso-8859-8"] == [x[0] for x in dammit.tried_encodings]

    def test_deprecated_override_encodings(self):
        """override_encodings still works, between the other two lists."""
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(
            hebrew,
            known_definite_encodings=["shift-jis"],
            override_encodings=["utf-8"],
            user_encodings=["iso-8859-8"],
        )
        assert "iso-8859-8" == dammit.original_encoding

        # Expected order of attempts: known definite encodings, then
        # override encodings, then user encodings.
        assert ["shift-jis", "utf-8", "iso-8859-8"] == (
            [x[0] for x in dammit.tried_encodings]
        )

    def test_detwingle(self):
        """detwingle() turns embedded Windows-1252 bytes into UTF-8."""
        # Three snowmen, encoded as UTF-8.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # A quoted sentence, encoded as Windows-1252.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # A document that mixes the two encodings.
        doc = utf8 + windows_1252 + utf8

        # As it stands, the document cannot be decoded as UTF-8.
        with pytest.raises(UnicodeDecodeError):
            doc.decode("utf8")

        # After detwingle() the whole document decodes as UTF-8.
        fixed = UnicodeDammit.detwingle(doc)
        assert "☃☃☃“Hi, I like Windows!”☃☃☃" == fixed.decode("utf8")

    def test_detwingle_ignores_multibyte_characters(self):
        """detwingle() leaves UTF-8 sequences that end in \\x93 untouched."""
        # Each string below encodes to UTF-8 bytes ending in \x93, which
        # on its own is the Windows-1252 left double quotation mark.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}",
            "\N{LATIN SUBSCRIPT SMALL LETTER X}",
            "\xf0\x90\x90\x93",
        ):
            encoded = tricky_unicode_char.encode("utf8")
            assert encoded.endswith(b'\x93')
            output = UnicodeDammit.detwingle(encoded)
            assert output == encoded

    def test_find_declared_encoding(self):
        """find_declared_encoding() reads XML and HTML encoding declarations."""
        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        # The <meta> declaration only counts when is_html is true.
        assert m(html_unicode, is_html=False) is None
        assert "utf-8" == m(html_unicode, is_html=True)
        assert "utf-8" == m(html_bytes, is_html=True)

        # The XML declaration is found in both str and bytes input.
        assert "iso-8859-1" == m(xml_unicode)
        assert "iso-8859-1" == m(xml_bytes)

        # A declaration preceded by 5000 spaces is not found by default.
        spacer = b' ' * 5000
        assert m(spacer + html_bytes) is None
        assert m(spacer + xml_bytes) is None

        # search_entire_document=True finds the HTML declaration anyway.
        assert (
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
            == "utf-8"
        )

        # Even then, the XML declaration is only found when nothing but
        # whitespace comes before it.
        assert m(xml_bytes, search_entire_document=True) == "iso-8859-1"
        assert m(b' ' + xml_bytes, search_entire_document=True) == "iso-8859-1"
        assert m(b'a' + xml_bytes, search_entire_document=True) is None
|
|
|
|
|
class TestEntitySubstitution(object):
    """Standalone tests of the EntitySubstitution class."""

    def setup_method(self):
        # Every method under test is called on the class itself.
        self.sub = EntitySubstitution

    @pytest.mark.parametrize(
        "original,substituted",
        [
            # Characters that have a named HTML entity are replaced by
            # it; the snowman is expected to come through unchanged.
            ("foo\u2200\N{SNOWMAN}\u00f5bar",
             "foo&forall;\N{SNOWMAN}&otilde;bar"),

            # Smart quotes get their own case.
            ('‘’foo“”', "&lsquo;&rsquo;foo&ldquo;&rdquo;"),
        ],
    )
    def test_substitute_html(self, original, substituted):
        """substitute_html() replaces characters with named entities."""
        assert self.sub.substitute_html(original) == substituted

    def test_html5_entity(self):
        """Spot-check conversion of characters to HTML5 named entities."""
        for entity, u in (
            # Characters and character sequences that are expected to
            # become named entities.
            ('&models;', '\u22a7'),
            ('&Nfr;', '\U0001d511'),
            ('&ngeqq;', '\u2267\u0338'),
            ('&not;', '\xac'),
            ('&Not;', '\u2aec'),

            # The ASCII vertical bar is expected to be left alone.
            ('|', '|'),

            # So is the two-letter sequence "fj".
            ("fj", "fj"),

            # These ASCII characters are expected to be escaped.
            ('&gt;', '>'),
            ('&lt;', '<'),
            ('&amp;', '&'),
        ):
            template = '3 %s 4'
            raw = template % u
            with_entities = template % entity
            assert self.sub.substitute_html(raw) == with_entities

    def test_html5_entity_with_variation_selector(self):
        """A character followed by U+FE00 maps to a different entity."""
        # U+2294 on its own is expected to become &sqcup;.
        data = "fjords \u2294 penguins"
        markup = "fjords &sqcup; penguins"
        assert self.sub.substitute_html(data) == markup

        # U+2294 followed by U+FE00 is expected to become &sqcups;.
        data = "fjords \u2294\ufe00 penguins"
        markup = "fjords &sqcups; penguins"
        assert self.sub.substitute_html(data) == markup

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        """With the second argument False, the value is returned as is."""
        s = 'Welcome to "my bar"'
        assert self.sub.substitute_xml(s, False) == s

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        """Quoted attribute values are wrapped in double quotes."""
        assert self.sub.substitute_xml("Welcome", True) == '"Welcome"'
        assert self.sub.substitute_xml("Bob's Bar", True) == '"Bob\'s Bar"'

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        """A value containing double quotes is wrapped in single quotes."""
        s = 'Welcome to "my bar"'
        assert self.sub.substitute_xml(s, True) == "'Welcome to \"my bar\"'"

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        """With both quote kinds present, the expected output wraps the
        value in double quotes and turns inner double quotes into &quot;."""
        s = 'Welcome to "Bob\'s Bar"'
        assert self.sub.substitute_xml(s, True) == '"Welcome to &quot;Bob\'s Bar&quot;"'

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        """Quote characters are untouched when no quoting is requested."""
        quoted = 'Welcome to "Bob\'s Bar"'
        assert self.sub.substitute_xml(quoted) == quoted

    def test_xml_quoting_handles_angle_brackets(self):
        """Angle brackets become &lt; and &gt;."""
        assert self.sub.substitute_xml("foo<bar>") == "foo&lt;bar&gt;"

    def test_xml_quoting_handles_ampersands(self):
        """A bare ampersand becomes &amp;."""
        assert self.sub.substitute_xml("AT&T") == "AT&amp;T"

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        """substitute_xml() escapes every ampersand, even one that
        starts an existing entity."""
        assert self.sub.substitute_xml("&Aacute;T&T") == "&amp;Aacute;T&amp;T"

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        """substitute_xml_containing_entities() keeps existing entities
        and escapes only the bare ampersand."""
        assert self.sub.substitute_xml_containing_entities("&Aacute;T&T") == "&Aacute;T&amp;T"

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        assert self.sub.substitute_html(text) == text
|
|