Spaces:
Sleeping
Sleeping
File size: 10,579 Bytes
69d9940 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 |
# coding: utf-8
"""
webencodings
~~~~~~~~~~~~
This is a Python implementation of the `WHATWG Encoding standard
<http://encoding.spec.whatwg.org/>`. See README for details.
:copyright: Copyright 2012 by Simon Sapin
:license: BSD, see LICENSE for details.
"""
from __future__ import unicode_literals
import codecs
from .labels import LABELS
VERSION = '0.5.1'
# Some names in Encoding are not valid Python aliases. Remap these.
PYTHON_NAMES = {
'iso-8859-8-i': 'iso-8859-8',
'x-mac-cyrillic': 'mac-cyrillic',
'macintosh': 'mac-roman',
'windows-874': 'cp874'}
CACHE = {}
def ascii_lower(string):
r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
:param string: An Unicode string.
:returns: A new Unicode string.
This is used for `ASCII case-insensitive
<http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
matching of encoding labels.
The same matching is also used, among other things,
for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.
This is different from the :meth:`~py:str.lower` method of Unicode strings
which also affect non-ASCII characters,
sometimes mapping them into the ASCII range:
>>> keyword = u'Bac\N{KELVIN SIGN}ground'
>>> assert keyword.lower() == u'background'
>>> assert ascii_lower(keyword) != keyword.lower()
>>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
"""
# This turns out to be faster than unicode.translate()
return string.encode('utf8').lower().decode('utf8')
def lookup(label):
"""
Look for an encoding by its label.
This is the spec’s `get an encoding
<http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
Supported labels are listed there.
:param label: A string.
:returns:
An :class:`Encoding` object, or :obj:`None` for an unknown label.
"""
# Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
label = ascii_lower(label.strip('\t\n\f\r '))
name = LABELS.get(label)
if name is None:
return None
encoding = CACHE.get(name)
if encoding is None:
if name == 'x-user-defined':
from .x_user_defined import codec_info
else:
python_name = PYTHON_NAMES.get(name, name)
# Any python_name value that gets to here should be valid.
codec_info = codecs.lookup(python_name)
encoding = Encoding(name, codec_info)
CACHE[name] = encoding
return encoding
def _get_encoding(encoding_or_label):
"""
Accept either an encoding object or label.
:param encoding: An :class:`Encoding` object or a label string.
:returns: An :class:`Encoding` object.
:raises: :exc:`~exceptions.LookupError` for an unknown label.
"""
if hasattr(encoding_or_label, 'codec_info'):
return encoding_or_label
encoding = lookup(encoding_or_label)
if encoding is None:
raise LookupError('Unknown encoding label: %r' % encoding_or_label)
return encoding
class Encoding(object):
"""Reresents a character encoding such as UTF-8,
that can be used for decoding or encoding.
.. attribute:: name
Canonical name of the encoding
.. attribute:: codec_info
The actual implementation of the encoding,
a stdlib :class:`~codecs.CodecInfo` object.
See :func:`codecs.register`.
"""
def __init__(self, name, codec_info):
self.name = name
self.codec_info = codec_info
def __repr__(self):
return '<Encoding %s>' % self.name
#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')
_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')
def decode(input, fallback_encoding, errors='replace'):
"""
Decode a single string.
:param input: A byte string
:param fallback_encoding:
An :class:`Encoding` object or a label string.
The encoding to use if :obj:`input` does note have a BOM.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:return:
A ``(output, encoding)`` tuple of an Unicode string
and an :obj:`Encoding`.
"""
# Fail early if `encoding` is an invalid label.
fallback_encoding = _get_encoding(fallback_encoding)
bom_encoding, input = _detect_bom(input)
encoding = bom_encoding or fallback_encoding
return encoding.codec_info.decode(input, errors)[0], encoding
def _detect_bom(input):
"""Return (bom_encoding, input), with any BOM removed from the input."""
if input.startswith(b'\xFF\xFE'):
return _UTF16LE, input[2:]
if input.startswith(b'\xFE\xFF'):
return _UTF16BE, input[2:]
if input.startswith(b'\xEF\xBB\xBF'):
return UTF8, input[3:]
return None, input
def encode(input, encoding=UTF8, errors='strict'):
"""
Encode a single string.
:param input: An Unicode string.
:param encoding: An :class:`Encoding` object or a label string.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:return: A byte string.
"""
return _get_encoding(encoding).codec_info.encode(input, errors)[0]
def iter_decode(input, fallback_encoding, errors='replace'):
"""
"Pull"-based decoder.
:param input:
An iterable of byte strings.
The input is first consumed just enough to determine the encoding
based on the precense of a BOM,
then consumed on demand when the return value is.
:param fallback_encoding:
An :class:`Encoding` object or a label string.
The encoding to use if :obj:`input` does note have a BOM.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:returns:
An ``(output, encoding)`` tuple.
:obj:`output` is an iterable of Unicode strings,
:obj:`encoding` is the :obj:`Encoding` that is being used.
"""
decoder = IncrementalDecoder(fallback_encoding, errors)
generator = _iter_decode_generator(input, decoder)
encoding = next(generator)
return generator, encoding
def _iter_decode_generator(input, decoder):
"""Return a generator that first yields the :obj:`Encoding`,
then yields output chukns as Unicode strings.
"""
decode = decoder.decode
input = iter(input)
for chunck in input:
output = decode(chunck)
if output:
assert decoder.encoding is not None
yield decoder.encoding
yield output
break
else:
# Input exhausted without determining the encoding
output = decode(b'', final=True)
assert decoder.encoding is not None
yield decoder.encoding
if output:
yield output
return
for chunck in input:
output = decode(chunck)
if output:
yield output
output = decode(b'', final=True)
if output:
yield output
def iter_encode(input, encoding=UTF8, errors='strict'):
"""
“Pull”-based encoder.
:param input: An iterable of Unicode strings.
:param encoding: An :class:`Encoding` object or a label string.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:returns: An iterable of byte strings.
"""
# Fail early if `encoding` is an invalid label.
encode = IncrementalEncoder(encoding, errors).encode
return _iter_encode_generator(input, encode)
def _iter_encode_generator(input, encode):
for chunck in input:
output = encode(chunck)
if output:
yield output
output = encode('', final=True)
if output:
yield output
class IncrementalDecoder(object):
"""
“Push”-based decoder.
:param fallback_encoding:
An :class:`Encoding` object or a label string.
The encoding to use if :obj:`input` does note have a BOM.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
"""
def __init__(self, fallback_encoding, errors='replace'):
# Fail early if `encoding` is an invalid label.
self._fallback_encoding = _get_encoding(fallback_encoding)
self._errors = errors
self._buffer = b''
self._decoder = None
#: The actual :class:`Encoding` that is being used,
#: or :obj:`None` if that is not determined yet.
#: (Ie. if there is not enough input yet to determine
#: if there is a BOM.)
self.encoding = None # Not known yet.
def decode(self, input, final=False):
"""Decode one chunk of the input.
:param input: A byte string.
:param final:
Indicate that no more input is available.
Must be :obj:`True` if this is the last call.
:returns: An Unicode string.
"""
decoder = self._decoder
if decoder is not None:
return decoder(input, final)
input = self._buffer + input
encoding, input = _detect_bom(input)
if encoding is None:
if len(input) < 3 and not final: # Not enough data yet.
self._buffer = input
return ''
else: # No BOM
encoding = self._fallback_encoding
decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
self._decoder = decoder
self.encoding = encoding
return decoder(input, final)
class IncrementalEncoder(object):
"""
“Push”-based encoder.
:param encoding: An :class:`Encoding` object or a label string.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
.. method:: encode(input, final=False)
:param input: An Unicode string.
:param final:
Indicate that no more input is available.
Must be :obj:`True` if this is the last call.
:returns: A byte string.
"""
def __init__(self, encoding=UTF8, errors='strict'):
encoding = _get_encoding(encoding)
self.encode = encoding.codec_info.incrementalencoder(errors).encode
|