Spaces:

zhangyi617
/

webui

Runtime error

App Files Files Community

webui / langchain /document_loaders /bibtex.py

zhangyi617

Upload folder using huggingface_hub

129cd69 almost 2 years ago

raw

history blame contribute delete

3.9 kB

	import logging
	import re
	from pathlib import Path
	from typing import Any, Iterator, List, Mapping, Optional

	from langchain.docstore.document import Document
	from langchain.document_loaders.base import BaseLoader
	from langchain.utilities.bibtex import BibtexparserWrapper

	# Module-level logger named after this module; BibtexLoader uses it to
	# report, at DEBUG level, PDF files it could not find.
	logger = logging.getLogger(__name__)


	class BibtexLoader(BaseLoader):
	    """Load a `bibtex` file.

	    Each document represents one entry from the bibtex file.

	    Only entries whose `file` field contains at least one name matching
	    `file_pattern` produce a document; all other entries are skipped. The
	    matched PDF files are resolved relative to the directory of the bibtex
	    file and their page texts are joined into the document text. If the
	    extracted text is empty (for example because the files are missing),
	    the `abstract` field is used instead.
	    """

	    def __init__(
	        self,
	        file_path: str,
	        *,
	        parser: Optional[BibtexparserWrapper] = None,
	        max_docs: Optional[int] = None,
	        max_content_chars: Optional[int] = 4_000,
	        load_extra_metadata: bool = False,
	        file_pattern: str = r"[^:]+\.pdf",
	    ):
	        """Initialize the BibtexLoader.

	        Args:
	            file_path: Path to the bibtex file.
	            parser: The parser to use. If None, a default parser is used.
	            max_docs: Max number of bibtex entries to process. None, 0 or a
	                negative value (e.g. -1) means no limit.
	            max_content_chars: Maximum number of characters kept in each
	                document's text. None or 0 disables truncation.
	            load_extra_metadata: Forwarded to the parser as `load_extra` when
	                the metadata of each document is built.
	            file_pattern: Regex pattern to match the file names in the
	                `file` field of a bibtex entry.
	        """
	        self.file_path = file_path
	        self.parser = parser or BibtexparserWrapper()
	        self.max_docs = max_docs
	        self.max_content_chars = max_content_chars
	        self.load_extra_metadata = load_extra_metadata
	        self.file_regex = re.compile(file_pattern)

	    def _load_entry(self, entry: Mapping[str, Any]) -> Optional[Document]:
	        """Build a document from a single bibtex entry.

	        Args:
	            entry: One parsed bibtex entry.

	        Returns:
	            The document for the entry, or None when the entry's `file`
	            field contains no name matching the configured pattern.
	        """
	        import fitz

	        # PyMuPDF documents its own `FileNotFoundError` (a RuntimeError
	        # subclass) for missing paths, which the builtin does not cover.
	        # Catch both; fall back to the builtin if the attribute is absent.
	        missing_file_errors = (
	            FileNotFoundError,
	            getattr(fitz, "FileNotFoundError", FileNotFoundError),
	        )

	        parent_dir = Path(self.file_path).parent
	        # regex is useful for Zotero flavor bibtex files
	        file_names = self.file_regex.findall(entry.get("file", ""))
	        if not file_names:
	            return None
	        texts: List[str] = []
	        for file_name in file_names:
	            try:
	                with fitz.open(parent_dir / file_name) as f:
	                    texts.extend(page.get_text() for page in f)
	            except missing_file_errors as e:
	                # Best effort: a missing attachment is logged and skipped.
	                logger.debug(e)
	        # Fall back to the abstract when no text was extracted.
	        content = "\n".join(texts) or entry.get("abstract", "")
	        if self.max_content_chars:
	            content = content[: self.max_content_chars]
	        metadata = self.parser.get_metadata(entry, load_extra=self.load_extra_metadata)
	        return Document(
	            page_content=content,
	            metadata=metadata,
	        )

	    def lazy_load(self) -> Iterator[Document]:
	        """Lazily load the bibtex file using bibtexparser and yield the
	        article texts plus the article metadata.
	        See https://bibtexparser.readthedocs.io/en/master/

	        Yields:
	            One document per bibtex entry that references a matching file,
	            with the document.page_content in text format.

	        Raises:
	            ImportError: If the PyMuPDF package is not installed.
	        """
	        try:
	            import fitz  # noqa: F401
	        except ImportError as e:
	            raise ImportError(
	                "PyMuPDF package not found, please install it with "
	                "`pip install pymupdf`"
	            ) from e

	        entries = self.parser.load_bibtex_entries(self.file_path)
	        # Only a positive limit truncates. Slicing with a negative value such
	        # as the documented -1 would silently drop entries from the end.
	        if self.max_docs is not None and self.max_docs > 0:
	            entries = entries[: self.max_docs]
	        for entry in entries:
	            doc = self._load_entry(entry)
	            if doc:
	                yield doc

	    def load(self) -> List[Document]:
	        """Load all documents from the bibtex file given at construction.

	        See https://bibtexparser.readthedocs.io/en/master/

	        Returns:
	            a list of documents with the document.page_content in text format
	        """
	        return list(self.lazy_load())