File size: 4,014 Bytes
9b0f4a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import logging
import re
from typing import Dict, Iterable, Optional, cast

from pdf2zh.glyphlist import glyphname2unicode
from pdf2zh.latin_enc import ENCODING
from pdf2zh.pdfexceptions import PDFKeyError
from pdf2zh.psparser import PSLiteral

# Matches a run of one or more hexadecimal digits (upper or lower case).
# The pattern carries no anchors, so how much of a string has to match
# depends on the method the caller uses (match / fullmatch / search).
HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")

# Module-level logger named after this module.
log = logging.getLogger(__name__)


def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode text it stands for.

    In contrast to the specification, this raises a key error instead of
    returning an empty string when the name is unknown. This way the caller
    must explicitly define what to do when there is no match.

    Resolution order, following the Adobe Glyph List specification:

    1. Everything from the first "." onwards is dropped.
    2. The remainder is split on "_"; if there are several components, each
       one is converted on its own and the results are concatenated.
    3. A name found in ``glyphname2unicode`` maps to its table entry.
    4. ``uni`` followed by one or more groups of four hex digits maps to one
       character per group.
    5. ``u`` followed by four to six hex digits maps to a single character.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: glyph name, e.g. ``"A"``, ``"f_i"``, ``"uni0041"``, ``"u1F600"``
    :returns: the unicode character(s) the name resolves to
    :raises PDFKeyError: if ``name`` is not a str, does not match any of the
        forms above, or denotes a code point that is a surrogate or lies
        beyond U+10FFFF
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            'Could not convert unicode name "%s" to character because '
            "it should be of type str but is of type %s" % (name, type(name)),
        )

    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        return "".join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith("uni"):
        # Slice the prefix off. str.strip("uni") would remove any of the
        # characters "u", "n", "i" from BOTH ends and so accept malformed
        # names such as "uniu0041".
        hex_digits = name[len("uni") :]

        # fullmatch: every character must be a hex digit. A prefix-only
        # match would let "uni004G" through and crash inside int().
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            code_points = [
                int(hex_digits[i : i + 4], base=16)
                for i in range(0, len(hex_digits), 4)
            ]
            for code_point in code_points:
                raise_key_error_for_invalid_unicode(code_point)
            return "".join(map(chr, code_points))

    elif name.startswith("u"):
        hex_digits = name[len("u") :]

        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            code_point = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(code_point)
            # Six hex digits can exceed the last Unicode code point; chr()
            # would raise ValueError there, so fall through to the key error.
            if code_point <= 0x10FFFF:
                return chr(code_point)

    raise PDFKeyError(
        'Could not convert unicode name "%s" to character because '
        "it does not match specification" % name,
    )


def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject code points that fall inside the UTF-16 surrogate block.

    Values D800 through DFFF are reserved for surrogate pairs in UTF-16 and
    therefore do not denote characters of their own.

    :param unicode_digit: the numeric code point to check
    :raises KeyError if unicode digit is invalid
    """
    # 0xD7FF is the last value below the block, 0xE000 the first one above it.
    if not (0xD7FF < unicode_digit < 0xE000):
        return

    message = (
        "Unicode digit %d is invalid because "
        "it is in the range D800 through DFFF" % unicode_digit
    )
    raise PDFKeyError(message)


class EncodingDB:
    """Character-code to Unicode tables, filled from ENCODING when the class
    body is executed.
    """

    # One table per code column of an ENCODING row.
    std2unicode: Dict[int, str] = {}
    mac2unicode: Dict[int, str] = {}
    win2unicode: Dict[int, str] = {}
    pdf2unicode: Dict[int, str] = {}

    # Each row holds a glyph name followed by its code in the four encodings.
    # A falsy code means the row contributes nothing to that encoding's table.
    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    # Encoding name -> table, consulted by get_encoding().
    encodings = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Optional[Iterable[object]] = None,
    ) -> Dict[int, str]:
        """Return the code-to-unicode table for an encoding, with optional
        overrides applied.

        :param name: encoding name; a name missing from ``encodings`` falls
            back to the standard-encoding table
        :param diff: optional sequence mixing ints and PSLiteral glyph names.
            An int sets the current code; every PSLiteral after it is
            assigned to the current code, which then advances by one.
            Items of any other type are ignored.
        :returns: the class-level table itself when ``diff`` is empty or
            None, otherwise a modified copy of it
        """
        base_table = cls.encodings.get(name, cls.std2unicode)
        if not diff:
            # No overrides: the shared table is handed out without copying.
            return base_table

        table = base_table.copy()
        code = 0
        for entry in diff:
            if isinstance(entry, int):
                code = entry
                continue
            if not isinstance(entry, PSLiteral):
                continue
            try:
                table[code] = name2unicode(cast(str, entry.name))
            except (KeyError, ValueError):
                # Best effort: a glyph name that cannot be resolved leaves
                # the existing entry for this code as it is.
                pass
            # The code advances whether or not the name could be resolved.
            code += 1
        return table