PDFTranslate / pdf2zh /pdfparser.py
sanbo
update sth. at 2024-11-26 16:15:47
9b0f4a0
raw
history blame
5.61 kB
import logging
from io import BytesIO
from typing import TYPE_CHECKING, BinaryIO, Optional, Union
from pdf2zh import settings
from pdf2zh.casting import safe_int
from pdf2zh.pdfexceptions import PDFException
from pdf2zh.pdftypes import PDFObjRef, PDFStream, dict_value, int_value
from pdf2zh.psexceptions import PSEOF
from pdf2zh.psparser import KWD, PSKeyword, PSStackParser
if TYPE_CHECKING:
from pdf2zh.pdfdocument import PDFDocument
log = logging.getLogger(__name__)
class PDFSyntaxError(PDFException):
    """Raised by the parsers in this module on malformed PDF syntax."""
# The PDFParser stack holds every base type plus PDFStream, PDFObjRef and None.
class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
    """Fetch PDF objects from a file stream.

    Indirect references are resolved against the PDF document registered
    through set_document().  It also reads XRefs at the end of every PDF
    file.

    Typical usage:
      parser = PDFParser(fp)
      parser.read_xref()
      parser.read_xref(fallback=True) # optional
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    """

    KEYWORD_R = KWD(b"R")
    KEYWORD_NULL = KWD(b"null")
    KEYWORD_ENDOBJ = KWD(b"endobj")
    KEYWORD_STREAM = KWD(b"stream")
    KEYWORD_XREF = KWD(b"xref")
    KEYWORD_STARTXREF = KWD(b"startxref")

    def __init__(self, fp: BinaryIO) -> None:
        """Create a parser reading from *fp*, with no document attached yet."""
        PSStackParser.__init__(self, fp)
        # Document used to build PDFObjRef objects and to decipher streams.
        self.doc: Optional[PDFDocument] = None
        # When true, /Length is ignored and stream data is collected line by
        # line up to the "endstream" marker.
        self.fallback = False

    def set_document(self, doc: "PDFDocument") -> None:
        """Attach the PDFDocument that this parser works on behalf of."""
        self.doc = doc

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Dispatch on a PDF keyword found at offset *pos*.

        xref/startxref and endobj flush stack entries into the results;
        null, R and stream push the corresponding object; any other
        keyword is pushed onto the stack unchanged.

        Raises PDFSyntaxError, only when settings.STRICT is set, if a
        stream dictionary has no /Length or the file ends inside a stream.
        """
        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))
            return

        if token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))
            return

        if token is self.KEYWORD_NULL:
            # The null object is represented by None.
            self.push((pos, None))
            return

        if token is self.KEYWORD_R:
            # Indirect reference: needs two operands on the stack; a lone
            # "R" is silently dropped.
            if len(self.curstack) < 2:
                return
            (_, raw_id), _ = self.pop(2)
            ref_id = safe_int(raw_id)
            if ref_id is not None:
                self.push((pos, PDFObjRef(self.doc, ref_id)))
            return

        if token is not self.KEYWORD_STREAM:
            # Every other keyword goes onto the stack untouched.
            self.push((pos, token))
            return

        # Stream object: the stream dictionary is the topmost stack entry.
        ((_, raw_dict),) = self.pop(1)
        stream_dict = dict_value(raw_dict)
        length = 0
        if not self.fallback:
            try:
                length = int_value(stream_dict["Length"])
            except KeyError:
                if settings.STRICT:
                    raise PDFSyntaxError("/Length is undefined: %r" % stream_dict)

        # Re-read the line holding the keyword so that pos can be advanced
        # past it, to the first byte of the stream data.
        self.seek(pos)
        try:
            _, keyword_line = self.nextline()  # 'stream'
        except PSEOF:
            if settings.STRICT:
                raise PDFSyntaxError("Unexpected EOF")
            return
        pos += len(keyword_line)

        self.fp.seek(pos)
        payload = bytearray(self.fp.read(length))
        self.seek(pos + length)

        # Scan forward to "endstream".  The length is always extended by
        # what is skipped; the bytes themselves are kept only in fallback
        # mode.
        while True:
            try:
                _, chunk = self.nextline()
            except PSEOF:
                if settings.STRICT:
                    raise PDFSyntaxError("Unexpected EOF")
                break
            marker_at = chunk.find(b"endstream")
            if marker_at >= 0:
                length += marker_at
                if self.fallback:
                    payload += chunk[:marker_at]
                break
            length += len(chunk)
            if self.fallback:
                payload += chunk
        self.seek(pos + length)
        # XXX limit the length not to exceed the object boundary

        assert self.doc is not None
        self.push((pos, PDFStream(stream_dict, bytes(payload), self.doc.decipher)))
class PDFStreamParser(PDFParser):
    """PDFStreamParser is used to parse PDF content streams
    that are contained in each page and hold the instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    KEYWORD_OBJ = KWD(b"obj")

    def __init__(self, data: bytes) -> None:
        """Wrap the in-memory *data* in a BytesIO and parse it like a file."""
        PDFParser.__init__(self, BytesIO(data))

    def flush(self) -> None:
        """Move everything currently on the stack into the results."""
        self.add_results(*self.popall())

    def do_keyword(self, pos: int, token: PSKeyword) -> None:
        """Handle a keyword found at offset *pos* of the content stream.

        "R" turns the two preceding stack entries into a PDFObjRef,
        "obj" and "endobj" are rejected (or ignored when not strict), and
        every other keyword is pushed onto the stack unchanged.

        Raises PDFSyntaxError, only when settings.STRICT is set, for an
        "R" without its two operands and for "obj"/"endobj".
        """
        if token is self.KEYWORD_R:
            # reference to indirect object
            if len(self.curstack) < 2:
                # Same guard as PDFParser.do_keyword: without it the
                # unpacking below fails with a bare ValueError on a stray
                # "R".
                if settings.STRICT:
                    raise PDFSyntaxError(
                        "Keyword R found without object id and generation number"
                    )
                return
            (_, _object_id), _ = self.pop(2)
            object_id = safe_int(_object_id)
            if object_id is not None:
                obj = PDFObjRef(self.doc, object_id)
                self.push((pos, obj))
            return

        if token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
            if settings.STRICT:
                # See PDF Spec 3.4.6: Only the object values are stored in the
                # stream; the obj and endobj keywords are not used.
                raise PDFSyntaxError("Keyword endobj found in stream")
            return

        # others
        self.push((pos, token))