Spaces:
Sleeping
Sleeping
# coding: utf-8
"""

    webencodings
    ~~~~~~~~~~~~

    This is a Python implementation of the `WHATWG Encoding standard
    <http://encoding.spec.whatwg.org/>`_. See README for details.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""
from __future__ import unicode_literals | |
import codecs | |
from .labels import LABELS | |
#: Version string of this module.
VERSION = '0.5.1'

# Some canonical names used by the Encoding standard are not valid Python
# codec aliases. lookup() translates them through this table before calling
# codecs.lookup(); names that are absent are passed through unchanged.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}

# Encoding objects already created by lookup(), keyed by canonical name.
CACHE = {}
# Maps the code points of A-Z to those of a-z. Only consulted on the slow
# path of ascii_lower(), when the string cannot be encoded as UTF-8.
_ASCII_LOWER_TABLE = dict(
    (code, code + 32) for code in range(ord('A'), ord('Z') + 1))


def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: A Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affects non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    Strings that cannot be encoded as UTF-8 (such as ones containing a lone
    surrogate code point on Python 3) are lower-cased with a translation
    table instead of raising :exc:`UnicodeEncodeError`.

    """
    try:
        # Fast path: this round-trip through UTF-8 bytes turns out to be
        # faster than unicode.translate().
        return string.encode('utf8').lower().decode('utf8')
    except UnicodeEncodeError:
        # The string is not encodable as UTF-8 (e.g. a lone surrogate).
        # Such input must not make label matching crash, so fall back to a
        # slower translation that only touches A-Z.
        return string.translate(_ASCII_LOWER_TABLE)
def lookup(label):
    """
    Look for an encoding by its label.

    This is the spec's `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
    Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.
        The same object is returned on every call for a given canonical
        name, since results are stored in :data:`CACHE`.

    """
    # Only ASCII whitespace is stripped:
    # U+0009, U+000A, U+000C, U+000D, and U+0020.
    normalized = ascii_lower(label.strip('\t\n\f\r '))
    canonical = LABELS.get(normalized)
    if canonical is None:
        return None

    cached = CACHE.get(canonical)
    if cached is not None:
        return cached

    if canonical == 'x-user-defined':
        # This encoding has no stdlib codec; use the bundled one.
        from .x_user_defined import codec_info
    else:
        # Any name that gets to here should be valid for codecs.lookup()
        # once remapped through PYTHON_NAMES.
        codec_info = codecs.lookup(PYTHON_NAMES.get(canonical, canonical))

    created = Encoding(canonical, codec_info)
    CACHE[canonical] = created
    return created
def _get_encoding(encoding_or_label):
    """
    Accept either an encoding object or label.

    :param encoding_or_label:
        An :class:`Encoding` object or a label string.
        Any object with a ``codec_info`` attribute is returned unchanged.
    :returns: An :class:`Encoding` object.
    :raises: :exc:`~exceptions.LookupError` for an unknown label.

    """
    if not hasattr(encoding_or_label, 'codec_info'):
        # Not an Encoding-like object: treat the argument as a label.
        found = lookup(encoding_or_label)
        if found is None:
            raise LookupError(
                'Unknown encoding label: %r' % encoding_or_label)
        return found
    return encoding_or_label
class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """

    def __init__(self, name, codec_info):
        # Both values are stored as given; no validation is done here.
        self.codec_info = codec_info
        self.name = name

    def __repr__(self):
        canonical_name = self.name
        return '<Encoding %s>' % canonical_name
#: The UTF-8 encoding. Should be used for new content and formats.
#: Also serves as the default ``encoding`` argument of :func:`encode`,
#: :func:`iter_encode` and :class:`IncrementalEncoder`.
UTF8 = lookup('utf-8')

# UTF-16 encodings, looked up once when the module is imported.
# _detect_bom() returns these when the input starts with the corresponding
# byte order mark.
_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')
def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single string.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of a Unicode string
        and an :obj:`Encoding`.

    """
    # Resolve the fallback first so that an invalid label fails early,
    # even when a BOM would make the fallback unnecessary.
    fallback = _get_encoding(fallback_encoding)
    detected, payload = _detect_bom(input)
    # A BOM, when present, takes precedence over the fallback.
    chosen = detected or fallback
    text = chosen.codec_info.decode(payload, errors)[0]
    return text, chosen
def _detect_bom(input):
    """Return (bom_encoding, input), with any BOM removed from the input.

    :param input: A byte string.
    :returns:
        A ``(bom_encoding, input)`` tuple. ``bom_encoding`` is the
        :class:`Encoding` matching the BOM found at the start of the input,
        or :obj:`None` (with the input returned unchanged) if there is none.

    """
    # Checked in this order: UTF-16LE, UTF-16BE, UTF-8.
    candidates = (
        (b'\xFF\xFE', _UTF16LE),
        (b'\xFE\xFF', _UTF16BE),
        (b'\xEF\xBB\xBF', UTF8),
    )
    for bom, bom_encoding in candidates:
        if input.startswith(bom):
            return bom_encoding, input[len(bom):]
    return None, input
def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single string.

    :param input: A Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    codec_info = _get_encoding(encoding).codec_info
    # The codec returns a tuple whose first item is the encoded bytes;
    # only that item is handed back to the caller.
    result = codec_info.encode(input, errors)
    return result[0]
def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    chunks = _iter_decode_generator(
        input, IncrementalDecoder(fallback_encoding, errors))
    # The generator's first item is the Encoding in use, not text.
    # Pull it out here so the caller only ever iterates over strings.
    detected = next(chunks)
    return chunks, detected
def _iter_decode_generator(input, decoder): | |
"""Return a generator that first yields the :obj:`Encoding`, | |
then yields output chukns as Unicode strings. | |
""" | |
decode = decoder.decode | |
input = iter(input) | |
for chunck in input: | |
output = decode(chunck) | |
if output: | |
assert decoder.encoding is not None | |
yield decoder.encoding | |
yield output | |
break | |
else: | |
# Input exhausted without determining the encoding | |
output = decode(b'', final=True) | |
assert decoder.encoding is not None | |
yield decoder.encoding | |
if output: | |
yield output | |
return | |
for chunck in input: | |
output = decode(chunck) | |
if output: | |
yield output | |
output = decode(b'', final=True) | |
if output: | |
yield output | |
def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # Build the encoder here rather than inside the generator, so that an
    # invalid label fails early instead of on first iteration.
    encoder = IncrementalEncoder(encoding, errors)
    return _iter_encode_generator(input, encoder.encode)
def _iter_encode_generator(input, encode): | |
for chunck in input: | |
output = encode(chunck) | |
if output: | |
yield output | |
output = encode('', final=True) | |
if output: | |
yield output | |
class IncrementalDecoder(object):
    """
    "Push"-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """

    def __init__(self, fallback_encoding, errors='replace'):
        # Resolve the label first so that an invalid one fails early.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes held back while there is too little input to rule out a BOM.
        self._buffer = b''
        # Bound ``decode`` of the underlying incremental decoder,
        # set once the encoding is known.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (Ie. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A Unicode string.

        """
        if self._decoder is not None:
            # Encoding already determined: delegate directly.
            return self._decoder(input, final)

        data = self._buffer + input
        detected, data = _detect_bom(data)
        if detected is None:
            if len(data) >= 3 or final:
                # No BOM, and no further input could turn this into one.
                detected = self._fallback_encoding
            else:
                # Not enough data yet to know whether a BOM is present.
                self._buffer = data
                return ''

        decode_chunk = detected.codec_info.incrementaldecoder(
            self._errors).decode
        self._decoder = decode_chunk
        self.encoding = detected
        return decode_chunk(data, final)
class IncrementalEncoder(object):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: A Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """

    def __init__(self, encoding=UTF8, errors='strict'):
        codec_info = _get_encoding(encoding).codec_info
        # ``encode`` is an instance attribute bound straight to the
        # underlying incremental encoder's method, not a method of this
        # class.
        self.encode = codec_info.incrementalencoder(errors).encode