"""Module contains common parsers for PDFs.""" | |
from __future__ import annotations | |
import warnings | |
from typing import ( | |
TYPE_CHECKING, | |
Any, | |
Iterable, | |
Iterator, | |
Mapping, | |
Optional, | |
Sequence, | |
Union, | |
) | |
from urllib.parse import urlparse | |
import numpy as np | |
from langchain_core.documents import Document | |
from langchain.document_loaders.base import BaseBlobParser | |
from langchain.document_loaders.blob_loaders import Blob | |
if TYPE_CHECKING: | |
import fitz.fitz | |
import pdfminer.layout | |
import pdfplumber.page | |
import pypdf._page | |
import pypdfium2._helpers.page | |
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"] | |
_PDF_FILTER_WITHOUT_LOSS = [ | |
"LZWDecode", | |
"LZW", | |
"FlateDecode", | |
"Fl", | |
"ASCII85Decode", | |
"A85", | |
"ASCIIHexDecode", | |
"AHx", | |
"RunLengthDecode", | |
"RL", | |
"CCITTFaxDecode", | |
"CCF", | |
"JBIG2Decode", | |
] | |


def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError:
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        )
    ocr = RapidOCR()
    text = ""
    for img in images:
        result, _ = ocr(img)
        if result:
            # Each OCR result is a (box, text, score) triple; keep the text
            # component, avoiding shadowing of the `text` accumulator.
            lines = [item[1] for item in result]
            text += "\n".join(lines)
    return text
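

# A minimal usage sketch for the helper above (not part of the library API);
# it assumes `rapidocr-onnxruntime` and Pillow are installed, and
# "scanned_page.png" is a hypothetical local image:
#
#     from PIL import Image
#     img = np.asarray(Image.open("scanned_page.png"))
#     print(extract_from_images_with_rapidocr([img]))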


class PyPDFParser(BaseBlobParser):
    """Load `PDF` using `pypdf`."""

    def __init__(
        self,
        password: Optional[Union[str, bytes]] = None,
        extract_images: bool = False,
    ):
        self.password = password
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import pypdf

        with blob.as_bytes_io() as pdf_file_obj:
            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
            yield from [
                Document(
                    page_content=page.extract_text()
                    + self._extract_images_from_page(page),
                    metadata={"source": blob.source, "page": page_number},
                )
                for page_number, page in enumerate(pdf_reader.pages)
            ]

    def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images or "/XObject" not in page["/Resources"].keys():
            return ""

        xObject = page["/Resources"]["/XObject"].get_object()  # type: ignore
        images = []
        for obj in xObject:
            if xObject[obj]["/Subtype"] == "/Image":
                if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
                    height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
                    images.append(
                        np.frombuffer(xObject[obj].get_data(), dtype=np.uint8).reshape(
                            height, width, -1
                        )
                    )
                elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
                    images.append(xObject[obj].get_data())
                else:
                    warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
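

# Usage sketch, assuming pypdf is installed and "example.pdf" is a
# hypothetical local file; `Blob.from_path` builds a path-backed blob:
#
#     parser = PyPDFParser(extract_images=False)
#     for doc in parser.lazy_parse(Blob.from_path("example.pdf")):
#         print(doc.metadata["page"], doc.page_content[:80])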


class PDFMinerParser(BaseBlobParser):
    """Parse `PDF` using `PDFMiner`."""

    def __init__(
        self, extract_images: bool = False, *, concatenate_pages: bool = True
    ):
        """Initialize a parser based on PDFMiner.

        Args:
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into a single
                document. Otherwise, return one document per page.
        """
        self.extract_images = extract_images
        self.concatenate_pages = concatenate_pages

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        if not self.extract_images:
            from pdfminer.high_level import extract_text

            with blob.as_bytes_io() as pdf_file_obj:
                if self.concatenate_pages:
                    text = extract_text(pdf_file_obj)
                    metadata = {"source": blob.source}
                    yield Document(page_content=text, metadata=metadata)
                else:
                    from pdfminer.pdfpage import PDFPage

                    pages = PDFPage.get_pages(pdf_file_obj)
                    for i, _ in enumerate(pages):
                        text = extract_text(pdf_file_obj, page_numbers=[i])
                        metadata = {"source": blob.source, "page": str(i)}
                        yield Document(page_content=text, metadata=metadata)
        else:
            import io

            from pdfminer.converter import PDFPageAggregator, TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
            from pdfminer.pdfpage import PDFPage

            text_io = io.StringIO()
            with blob.as_bytes_io() as pdf_file_obj:
                pages = PDFPage.get_pages(pdf_file_obj)
                rsrcmgr = PDFResourceManager()
                device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
                device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
                interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
                for i, page in enumerate(pages):
                    interpreter_for_text.process_page(page)
                    interpreter_for_image.process_page(page)
                    content = text_io.getvalue() + self._extract_images_from_page(
                        device_for_image.get_result()
                    )
                    # Reset the buffer so each Document contains only its page.
                    text_io.truncate(0)
                    text_io.seek(0)
                    metadata = {"source": blob.source, "page": str(i)}
                    yield Document(page_content=content, metadata=metadata)

    def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        import pdfminer

        def get_image(layout_object: Any) -> Any:
            if isinstance(layout_object, pdfminer.layout.LTImage):
                return layout_object
            if isinstance(layout_object, pdfminer.layout.LTContainer):
                # Recurse into the container and return the first image found,
                # not just the result for the first child.
                for child in layout_object:
                    image = get_image(child)
                    if image is not None:
                        return image
            return None

        images = []
        for img in filter(bool, map(get_image, page)):
            if img.stream["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
                        img.stream["Height"], img.stream["Width"], -1
                    )
                )
            elif img.stream["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img.stream.get_data())
            else:
                warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
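

# Sketch of the two PDFMiner modes ("example.pdf" is a hypothetical path):
# the default concatenates the whole file into one Document, while
# `concatenate_pages=False` yields one Document per page.
#
#     whole = list(PDFMinerParser().lazy_parse(Blob.from_path("example.pdf")))
#     pages = list(
#         PDFMinerParser(concatenate_pages=False).lazy_parse(
#             Blob.from_path("example.pdf")
#         )
#     )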


class PyMuPDFParser(BaseBlobParser):
    """Parse `PDF` using `PyMuPDF`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
            extract_images: Whether to extract images from PDF.
        """
        self.text_kwargs = text_kwargs or {}
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import fitz

        with blob.as_bytes_io() as file_path:
            doc = fitz.open(file_path)  # open document

            yield from [
                Document(
                    page_content=page.get_text(**self.text_kwargs)
                    + self._extract_images_from_page(doc, page),
                    metadata=dict(
                        {
                            "source": blob.source,
                            "file_path": blob.source,
                            "page": page.number,
                            "total_pages": len(doc),
                        },
                        **{
                            k: doc.metadata[k]
                            for k in doc.metadata
                            if type(doc.metadata[k]) in [str, int]
                        },
                    ),
                )
                for page in doc
            ]

    def _extract_images_from_page(
        self, doc: fitz.fitz.Document, page: fitz.fitz.Page
    ) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""
        import fitz

        img_list = page.get_images()
        imgs = []
        for img in img_list:
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            imgs.append(
                np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, -1
                )
            )
        return extract_from_images_with_rapidocr(imgs)
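

# Sketch: `text_kwargs` is forwarded to `fitz.Page.get_text()`; for example,
# `sort=True` asks PyMuPDF to emit text blocks in natural reading order
# ("example.pdf" is a hypothetical path):
#
#     parser = PyMuPDFParser(text_kwargs={"sort": True})
#     docs = list(parser.lazy_parse(Blob.from_path("example.pdf")))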


class PyPDFium2Parser(BaseBlobParser):
    """Parse `PDF` with `PyPDFium2`."""

    def __init__(self, extract_images: bool = False) -> None:
        """Initialize the parser."""
        try:
            import pypdfium2  # noqa:F401
        except ImportError:
            raise ImportError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            )
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import pypdfium2

        # pypdfium2 is really finicky with respect to closing things;
        # if done incorrectly, it creates seg faults.
        with blob.as_bytes_io() as file_path:
            pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
            try:
                for page_number, page in enumerate(pdf_reader):
                    text_page = page.get_textpage()
                    content = text_page.get_text_range()
                    text_page.close()
                    content += "\n" + self._extract_images_from_page(page)
                    page.close()
                    metadata = {"source": blob.source, "page": page_number}
                    yield Document(page_content=content, metadata=metadata)
            finally:
                pdf_reader.close()

    def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        import pypdfium2.raw as pdfium_c

        images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)))
        images = list(map(lambda x: x.get_bitmap().to_numpy(), images))
        return extract_from_images_with_rapidocr(images)


class PDFPlumberParser(BaseBlobParser):
    """Parse `PDF` with `PDFPlumber`."""

    def __init__(
        self,
        text_kwargs: Optional[Mapping[str, Any]] = None,
        dedupe: bool = False,
        extract_images: bool = False,
    ) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to
                ``pdfplumber.Page.extract_text()``.
            dedupe: If True, remove duplicated characters before extracting text.
            extract_images: Whether to extract images from PDF.
        """
        self.text_kwargs = text_kwargs or {}
        self.dedupe = dedupe
        self.extract_images = extract_images

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
        import pdfplumber

        with blob.as_bytes_io() as file_path:
            doc = pdfplumber.open(file_path)  # open document

            yield from [
                Document(
                    page_content=self._process_page_content(page)
                    + "\n"
                    + self._extract_images_from_page(page),
                    metadata=dict(
                        {
                            "source": blob.source,
                            "file_path": blob.source,
                            "page": page.page_number - 1,
                            "total_pages": len(doc.pages),
                        },
                        **{
                            k: doc.metadata[k]
                            for k in doc.metadata
                            if type(doc.metadata[k]) in [str, int]
                        },
                    ),
                )
                for page in doc.pages
            ]

    def _process_page_content(self, page: pdfplumber.page.Page) -> str:
        """Process the page content based on dedupe."""
        if self.dedupe:
            return page.dedupe_chars().extract_text(**self.text_kwargs)
        return page.extract_text(**self.text_kwargs)

    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
        """Extract images from page and get the text with RapidOCR."""
        if not self.extract_images:
            return ""

        images = []
        for img in page.images:
            if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
                images.append(
                    np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
                        img["stream"]["Height"], img["stream"]["Width"], -1
                    )
                )
            elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
                images.append(img["stream"].get_data())
            else:
                warnings.warn("Unknown PDF Filter!")
        return extract_from_images_with_rapidocr(images)
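

# Sketch: with `dedupe=True` the parser calls pdfplumber's `dedupe_chars()`
# before extracting text, dropping visually duplicated characters that some
# PDFs use for faux-bold rendering ("example.pdf" is a hypothetical path):
#
#     parser = PDFPlumberParser(dedupe=True)
#     docs = list(parser.lazy_parse(Blob.from_path("example.pdf")))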


class AmazonTextractPDFParser(BaseBlobParser):
    """Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single- and multi-page documents are supported, with up to 3000 pages
    and 512 MB in size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain.document_loaders import AmazonTextractPDFLoader

    loader = AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract,

    ```python
    from langchain.document_loaders import AmazonTextractPDFLoader

    # you can mix and match each of the features
    loader = AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"],
    )
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    tries to output the information in a tabular structure or
    output the key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.
    """

    def __init__(
        self,
        textract_features: Optional[Sequence[int]] = None,
        client: Optional[Any] = None,
    ) -> None:
        """Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                should be passed as an int that conforms to the enum
                `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
        """
        try:
            import textractcaller as tc
            import textractor.entities.document as textractor

            self.tc = tc
            self.textractor = textractor

            if textract_features is not None:
                self.textract_features = [
                    tc.Textract_Features(f) for f in textract_features
                ]
            else:
                self.textract_features = []
        except ImportError:
            raise ImportError(
                "Could not import amazon-textract-caller or "
                "amazon-textract-textractor python package. Please install it "
                "with `pip install amazon-textract-caller` & "
                "`pip install amazon-textract-textractor`."
            )

        if not client:
            try:
                import boto3

                self.boto3_textract_client = boto3.client("textract")
            except ImportError:
                raise ImportError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                )
        else:
            self.boto3_textract_client = client

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Iterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers. For multi-page documents, blob.path
        has to be set to the S3 URI; for single-page documents, blob.data is used.
        """

        url_parse_result = urlparse(str(blob.path)) if blob.path else None
        # Either call with S3 path (multi-page) or with bytes (single-page)
        if (
            url_parse_result
            and url_parse_result.scheme == "s3"
            and url_parse_result.netloc
        ):
            textract_response_json = self.tc.call_textract(
                input_document=str(blob.path),
                features=self.textract_features,
                boto3_textract_client=self.boto3_textract_client,
            )
        else:
            textract_response_json = self.tc.call_textract(
                input_document=blob.as_bytes(),
                features=self.textract_features,
                call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                boto3_textract_client=self.boto3_textract_client,
            )

        document = self.textractor.Document.open(textract_response_json)

        linearizer_config = self.textractor.TextLinearizationConfig(
            hide_figure_layout=True,
            title_prefix="# ",
            section_header_prefix="## ",
            list_element_prefix="*",
        )
        for idx, page in enumerate(document.pages):
            yield Document(
                page_content=page.get_text(config=linearizer_config),
                metadata={"source": blob.source, "page": idx + 1},
            )
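

# Parser-level usage sketch, assuming AWS credentials are configured and the
# S3 URI below is a hypothetical placeholder; multi-page PDFs must be passed
# by S3 URI, as noted in the class docstring:
#
#     import boto3
#     parser = AmazonTextractPDFParser(client=boto3.client("textract"))
#     docs = list(parser.lazy_parse(Blob(path="s3://my-bucket/docs/report.pdf")))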


class DocumentIntelligenceParser(BaseBlobParser):
    """Loads a PDF with Azure Document Intelligence
    (formerly Form Recognizer) and chunks at character level."""

    def __init__(self, client: Any, model: str):
        self.client = client
        self.model = model

    def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]:
        for p in result.pages:
            content = " ".join([line.content for line in p.lines])

            d = Document(
                page_content=content,
                metadata={
                    "source": blob.source,
                    "page": p.page_number,
                },
            )
            yield d

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        with blob.as_bytes_io() as file_obj:
            poller = self.client.begin_analyze_document(self.model, file_obj)
            result = poller.result()

            docs = self._generate_docs(blob, result)

            yield from docs
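

# Sketch of constructing the Azure client this parser expects; the endpoint,
# key, and "example.pdf" path are hypothetical placeholders (requires
# `pip install azure-ai-formrecognizer`):
#
#     from azure.ai.formrecognizer import DocumentAnalysisClient
#     from azure.core.credentials import AzureKeyCredential
#
#     client = DocumentAnalysisClient(
#         endpoint="https://<resource>.cognitiveservices.azure.com/",
#         credential=AzureKeyCredential("<api-key>"),
#     )
#     parser = DocumentIntelligenceParser(client=client, model="prebuilt-document")
#     docs = list(parser.lazy_parse(Blob.from_path("example.pdf")))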