File size: 5,611 Bytes
9b0f4a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import logging
from io import BytesIO
from typing import TYPE_CHECKING, BinaryIO, Optional, Union

from pdf2zh import settings
from pdf2zh.casting import safe_int
from pdf2zh.pdfexceptions import PDFException
from pdf2zh.pdftypes import PDFObjRef, PDFStream, dict_value, int_value
from pdf2zh.psexceptions import PSEOF
from pdf2zh.psparser import KWD, PSKeyword, PSStackParser

if TYPE_CHECKING:
    from pdf2zh.pdfdocument import PDFDocument

log = logging.getLogger(__name__)


class PDFSyntaxError(PDFException):
    """Signals malformed PDF syntax met while parsing.

    Within this module it is raised, when ``settings.STRICT`` is enabled,
    for a stream dictionary without ``/Length``, for an unexpected end of
    file inside a stream, and for ``obj``/``endobj`` keywords found while
    parsing stream data.
    """


# PDFParser stack holds all the base types plus PDFStream, PDFObjRef, and None
class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
    """Stack parser that turns a PDF byte stream into PDF objects.

    On top of the generic stack parser it gives meaning to the PDF
    keywords ``R``, ``null``, ``endobj``, ``stream``, ``xref`` and
    ``startxref``. Indirect references become PDFObjRef objects bound to
    the document registered through set_document().

    Typical usage:
      parser = PDFParser(fp)
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    """

    KEYWORD_R = KWD(b"R")
    KEYWORD_NULL = KWD(b"null")
    KEYWORD_ENDOBJ = KWD(b"endobj")
    KEYWORD_STREAM = KWD(b"stream")
    KEYWORD_XREF = KWD(b"xref")
    KEYWORD_STARTXREF = KWD(b"startxref")

    def __init__(self, fp: BinaryIO) -> None:
        """Create a parser reading from the binary file object ``fp``."""
        PSStackParser.__init__(self, fp)
        # Document used to build references and to decipher streams.
        self.doc: Optional[PDFDocument] = None
        # When True, /Length is ignored and stream data is gathered by
        # scanning line by line for the "endstream" marker instead.
        self.fallback = False

    def set_document(self, doc: "PDFDocument") -> None:
        """Bind this parser to the PDFDocument ``doc``."""
        self.doc = doc

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Dispatch on a keyword token found at offset ``pos``.

        Keywords without a dedicated meaning here are pushed on the stack
        unchanged as ``(pos, token)``.
        """
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))
            return

        if token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))
            return

        if token is self.KEYWORD_NULL:
            # The null object is represented by None.
            self.push((pos, None))
            return

        if token is self.KEYWORD_R:
            self._handle_indirect_ref(pos)
            return

        if token is self.KEYWORD_STREAM:
            self._handle_stream(pos)
            return

        self.push((pos, token))

    def _handle_indirect_ref(self, pos: int) -> None:
        """Replace the two operands before ``R`` with a PDFObjRef.

        Nothing happens when fewer than two entries are on the stack. When
        the object id cannot be converted by safe_int(), the two operands
        are consumed and nothing is pushed.
        """
        if len(self.curstack) < 2:
            return
        (_, raw_object_id), _ = self.pop(2)
        object_id = safe_int(raw_object_id)
        if object_id is None:
            return
        self.push((pos, PDFObjRef(self.doc, object_id)))

    def _handle_stream(self, pos: int) -> None:
        """Read the stream body following ``stream`` and push a PDFStream.

        ``pos`` is the offset of the ``stream`` keyword; the entry on top
        of the stack is taken as the stream dictionary.

        Raises PDFSyntaxError when ``settings.STRICT`` is set and either
        ``/Length`` is missing or the file ends inside the stream.
        """
        ((_, raw_dict),) = self.pop(1)
        dic = dict_value(raw_dict)

        length = 0
        if not self.fallback:
            try:
                length = int_value(dic["Length"])
            except KeyError:
                if settings.STRICT:
                    raise PDFSyntaxError("/Length is undefined: %r" % dic)

        # Re-read the keyword line so its full length (including the line
        # ending) tells us where the payload begins.
        self.seek(pos)
        try:
            _, keyword_line = self.nextline()
        except PSEOF:
            if settings.STRICT:
                raise PDFSyntaxError("Unexpected EOF")
            return

        data_pos = pos + len(keyword_line)
        self.fp.seek(data_pos)
        payload = bytearray(self.fp.read(length))
        self.seek(data_pos + length)

        # Walk forward to the "endstream" marker. The consumed bytes always
        # count towards the length; they are appended to the payload only
        # in fallback mode.
        while True:
            try:
                _, chunk = self.nextline()
            except PSEOF:
                if settings.STRICT:
                    raise PDFSyntaxError("Unexpected EOF")
                break
            marker_at = chunk.find(b"endstream")
            found = marker_at >= 0
            kept = chunk[:marker_at] if found else chunk
            length += len(kept)
            if self.fallback:
                payload += kept
            if found:
                break

        self.seek(data_pos + length)
        # XXX limit objlen not to exceed object boundary
        assert self.doc is not None
        self.push((data_pos, PDFStream(dic, bytes(payload), self.doc.decipher)))


class PDFStreamParser(PDFParser):
    """PDFStreamParser is used to parse PDF content streams
    that are contained in each page and have instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    KEYWORD_OBJ = KWD(b"obj")

    def __init__(self, data: bytes) -> None:
        """Create a parser over the in-memory stream bytes ``data``."""
        PDFParser.__init__(self, BytesIO(data))

    def flush(self) -> None:
        """Hand everything returned by popall() to add_results()."""
        self.add_results(*self.popall())

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword token found at offset ``pos`` in the stream.

        ``R`` turns the two preceding operands into a PDFObjRef; ``obj``
        and ``endobj`` are dropped; any other keyword is pushed on the
        stack as ``(pos, token)``.

        Raises:
            PDFSyntaxError: when ``settings.STRICT`` is set and either
                ``R`` has fewer than two operands on the stack or an
                ``obj``/``endobj`` keyword is encountered.
        """
        if token is self.KEYWORD_R:
            # reference to indirect object
            if len(self.curstack) < 2:
                # Same guard as PDFParser.do_keyword: without it, unpacking
                # the result of pop(2) fails with a bare ValueError on a
                # stray "R".
                if settings.STRICT:
                    raise PDFSyntaxError(
                        "Indirect reference is missing its object id "
                        "or generation number"
                    )
                return
            (_, _object_id), _ = self.pop(2)
            object_id = safe_int(_object_id)
            if object_id is not None:
                obj = PDFObjRef(self.doc, object_id)
                self.push((pos, obj))
            return

        elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                # NOTE(review): the message names "endobj" even when the
                # offending keyword is "obj".
                raise PDFSyntaxError("Keyword endobj found in stream")
            return

        # others
        self.push((pos, token))