Spaces:
Runtime error
Runtime error
| import logging | |
| import re | |
| from pathlib import Path | |
| from typing import Any, Iterator, List, Mapping, Optional | |
| from langchain.docstore.document import Document | |
| from langchain.document_loaders.base import BaseLoader | |
| from langchain.utilities.bibtex import BibtexparserWrapper | |
| logger = logging.getLogger(__name__) | |
| class BibtexLoader(BaseLoader): | |
| """Load a `bibtex` file. | |
| Each document represents one entry from the bibtex file. | |
| If a PDF file is present in the `file` bibtex field, the original PDF | |
| is loaded into the document text. If no such file entry is present, | |
| the `abstract` field is used instead. | |
| """ | |
| def __init__( | |
| self, | |
| file_path: str, | |
| *, | |
| parser: Optional[BibtexparserWrapper] = None, | |
| max_docs: Optional[int] = None, | |
| max_content_chars: Optional[int] = 4_000, | |
| load_extra_metadata: bool = False, | |
| file_pattern: str = r"[^:]+\.pdf", | |
| ): | |
| """Initialize the BibtexLoader. | |
| Args: | |
| file_path: Path to the bibtex file. | |
| parser: The parser to use. If None, a default parser is used. | |
| max_docs: Max number of associated documents to load. Use -1 means | |
| no limit. | |
| max_content_chars: Maximum number of characters to load from the PDF. | |
| load_extra_metadata: Whether to load extra metadata from the PDF. | |
| file_pattern: Regex pattern to match the file name in the bibtex. | |
| """ | |
| self.file_path = file_path | |
| self.parser = parser or BibtexparserWrapper() | |
| self.max_docs = max_docs | |
| self.max_content_chars = max_content_chars | |
| self.load_extra_metadata = load_extra_metadata | |
| self.file_regex = re.compile(file_pattern) | |
| def _load_entry(self, entry: Mapping[str, Any]) -> Optional[Document]: | |
| import fitz | |
| parent_dir = Path(self.file_path).parent | |
| # regex is useful for Zotero flavor bibtex files | |
| file_names = self.file_regex.findall(entry.get("file", "")) | |
| if not file_names: | |
| return None | |
| texts: List[str] = [] | |
| for file_name in file_names: | |
| try: | |
| with fitz.open(parent_dir / file_name) as f: | |
| texts.extend(page.get_text() for page in f) | |
| except FileNotFoundError as e: | |
| logger.debug(e) | |
| content = "\n".join(texts) or entry.get("abstract", "") | |
| if self.max_content_chars: | |
| content = content[: self.max_content_chars] | |
| metadata = self.parser.get_metadata(entry, load_extra=self.load_extra_metadata) | |
| return Document( | |
| page_content=content, | |
| metadata=metadata, | |
| ) | |
| def lazy_load(self) -> Iterator[Document]: | |
| """Load bibtex file using bibtexparser and get the article texts plus the | |
| article metadata. | |
| See https://bibtexparser.readthedocs.io/en/master/ | |
| Returns: | |
| a list of documents with the document.page_content in text format | |
| """ | |
| try: | |
| import fitz # noqa: F401 | |
| except ImportError: | |
| raise ImportError( | |
| "PyMuPDF package not found, please install it with " | |
| "`pip install pymupdf`" | |
| ) | |
| entries = self.parser.load_bibtex_entries(self.file_path) | |
| if self.max_docs: | |
| entries = entries[: self.max_docs] | |
| for entry in entries: | |
| doc = self._load_entry(entry) | |
| if doc: | |
| yield doc | |
| def load(self) -> List[Document]: | |
| """Load bibtex file documents from the given bibtex file path. | |
| See https://bibtexparser.readthedocs.io/en/master/ | |
| Args: | |
| file_path: the path to the bibtex file | |
| Returns: | |
| a list of documents with the document.page_content in text format | |
| """ | |
| return list(self.lazy_load()) | |