Spaces:
Running
Running
File size: 4,014 Bytes
9b0f4a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import logging
import re
from typing import Dict, Iterable, Optional, cast
from pdf2zh.glyphlist import glyphname2unicode
from pdf2zh.latin_enc import ENCODING
from pdf2zh.pdfexceptions import PDFKeyError
from pdf2zh.psparser import PSLiteral
# Matches a run of one or more hexadecimal digits (upper or lower case).
# The pattern has no anchors of its own, so whether trailing characters are
# tolerated depends on the method the caller uses (match vs. fullmatch).
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode text it stands for.

    In contrast to the specification, this raises a key error instead of
    returning an empty string when the name is unknown. This way the caller
    must explicitly define what to do when there is not a match.

    Resolution order, following the AGL mapping rules:

    1. Everything from the first "." onwards is dropped (variant suffix).
    2. A name containing "_" is a ligature: each component is converted
       on its own and the results are concatenated.
    3. A name found in ``glyphname2unicode`` maps directly.
    4. ``uni`` followed by one or more groups of exactly four hex digits
       maps each group to one character.
    5. ``u`` followed by four to six hex digits maps to a single character.

    The hex digits must make up the whole remainder of the name, and the
    prefix is removed by slicing. ``str.strip("uni")`` would remove any of
    the characters "u", "n", "i" from both ends, and a prefix-only regex
    match would let trailing garbage reach ``int()`` and surface as a
    ValueError. Lower-case hex digits are accepted, as before, even though
    the specification only lists upper-case ones.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name, e.g. ``"A"``, ``"f_i"``, ``"uni00410042"``
        or ``"u1F600"``
    :returns: the Unicode string the glyph name resolves to
    :raises PDFKeyError: if ``name`` is not a str, does not match any of
        the rules above, or denotes a value outside the Unicode range;
        ``raise_key_error_for_invalid_unicode`` additionally rejects
        surrogate values
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            'Could not convert unicode name "%s" to character because '
            "it should be of type str but is of type %s" % (name, type(name)),
        )

    # Drop the variant suffix ("a.sc" -> "a") before looking at components.
    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        # Ligature: resolve every component independently and concatenate.
        return "".join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith("uni"):
        # Slice off exactly the prefix; the rest must be hex digits only.
        hex_digits = name[len("uni"):]
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            code_points = [
                int(hex_digits[i : i + 4], base=16)
                for i in range(0, len(hex_digits), 4)
            ]
            for code_point in code_points:
                raise_key_error_for_invalid_unicode(code_point)
            return "".join(map(chr, code_points))

    elif name.startswith("u"):
        hex_digits = name[len("u"):]
        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            code_point = int(hex_digits, base=16)
            # Six hex digits can exceed the last code point; chr() would
            # raise ValueError there, so fall through to the key error.
            if code_point <= 0x10FFFF:
                raise_key_error_for_invalid_unicode(code_point)
                return chr(code_point)

    raise PDFKeyError(
        'Could not convert unicode name "%s" to character because '
        "it does not match specification" % name,
    )
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject values that fall inside the UTF-16 surrogate block.

    Values in the range D800 through DFFF are used for surrogate pairs in
    UTF-16 and therefore must not be treated as characters of their own.

    :param unicode_digit: numeric value to validate
    :raises PDFKeyError: if the value lies strictly between 0xD7FF and 0xE000
    """
    # Strict bounds on both sides: 0xD7FF and 0xE000 themselves are accepted.
    in_surrogate_block = 0xD7FF < unicode_digit < 0xE000
    if not in_surrogate_block:
        return

    raise PDFKeyError(
        "Unicode digit %d is invalid because "
        "it is in the range D800 through DFFF" % unicode_digit,
    )
class EncodingDB:
    """Character-code to Unicode tables for the predefined encodings.

    The four ``*2unicode`` dictionaries are filled once, while the class body
    executes, from the rows of ``ENCODING``. ``encodings`` exposes them under
    their encoding names for lookup by ``get_encoding``.
    """

    std2unicode: Dict[int, str] = {}
    mac2unicode: Dict[int, str] = {}
    win2unicode: Dict[int, str] = {}
    pdf2unicode: Dict[int, str] = {}

    # Each ENCODING row is a glyph name followed by its code in the standard,
    # Mac, Windows and PDFDoc encodings. A falsy code means the glyph has no
    # slot in that encoding, so the corresponding table is left alone.
    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    encodings = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Optional[Iterable[object]] = None,
    ) -> Dict[int, str]:
        """Return the code-to-Unicode table for ``name``, patched by ``diff``.

        :param name: encoding name; names missing from ``encodings`` fall
            back to the standard table
        :param diff: optional sequence of overrides (presumably a PDF
            /Differences array -- confirm against callers). An int sets the
            current code; each PSLiteral assigns its glyph to the current
            code and advances the code by one. Other items are ignored.
        :returns: the shared class-level table when ``diff`` is falsy,
            otherwise a patched copy
        """
        base_table = cls.encodings.get(name, cls.std2unicode)
        if not diff:
            # No overrides: hand back the shared table itself, not a copy.
            return base_table

        patched = base_table.copy()
        code = 0
        for entry in diff:
            if isinstance(entry, int):
                code = entry
                continue
            if not isinstance(entry, PSLiteral):
                continue
            try:
                patched[code] = name2unicode(cast(str, entry.name))
            except (KeyError, ValueError):
                # Glyph names that cannot be resolved are skipped; the code
                # keeps whatever the base table had and still advances.
                pass
            code += 1
        return patched
|