File size: 3,904 Bytes
129cd69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import logging
import re
from pathlib import Path
from typing import Any, Iterator, List, Mapping, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.utilities.bibtex import BibtexparserWrapper

logger = logging.getLogger(__name__)


class BibtexLoader(BaseLoader):
    """Load a `bibtex` file.

    Each document represents one entry from the bibtex file.

    Only entries whose `file` bibtex field contains at least one name matching
    `file_pattern` produce a document; entries without a match are skipped.
    The text of the matching PDF files is loaded into the document text. If no
    text could be read from them (for example because the files are missing
    on disk), the `abstract` field is used instead.
    """

    def __init__(
        self,
        file_path: str,
        *,
        parser: Optional[BibtexparserWrapper] = None,
        max_docs: Optional[int] = None,
        max_content_chars: Optional[int] = 4_000,
        load_extra_metadata: bool = False,
        file_pattern: str = r"[^:]+\.pdf",
    ):
        """Initialize the BibtexLoader.

        Args:
            file_path: Path to the bibtex file. Referenced PDF files are
                resolved relative to the directory containing this file.
            parser: The parser to use. If None, a default parser is used.
            max_docs: Max number of bibtex entries to process. The limit is
                applied before entries without a matching file are skipped,
                so fewer documents may be returned. None, 0 or a negative
                value (e.g. -1) means no limit.
            max_content_chars: Maximum number of characters kept in the
                document text. None or 0 disables truncation.
            load_extra_metadata: Forwarded to the parser's `get_metadata` as
                `load_extra`.
            file_pattern: Regex pattern to match the file name in the bibtex.
        """
        self.file_path = file_path
        self.parser = parser or BibtexparserWrapper()
        self.max_docs = max_docs
        self.max_content_chars = max_content_chars
        self.load_extra_metadata = load_extra_metadata
        self.file_regex = re.compile(file_pattern)

    def _load_entry(self, entry: Mapping[str, Any]) -> Optional[Document]:
        """Build a document from a single bibtex entry.

        Args:
            entry: The parsed bibtex entry. Its `file` field is searched for
                file names and its `abstract` field is the fallback text.

        Returns:
            The document for the entry, or None when the `file` field holds
            no name matching the configured pattern.
        """
        import fitz

        parent_dir = Path(self.file_path).parent
        # regex is useful for Zotero flavor bibtex files
        file_names = self.file_regex.findall(entry.get("file", ""))
        if not file_names:
            return None

        # NOTE(review): PyMuPDF appears to raise its own
        # `fitz.FileNotFoundError`, which is not a subclass of the builtin;
        # catch both so a missing PDF falls back to the abstract instead of
        # aborting the load. `getattr` keeps this safe if the attribute is
        # absent in the installed version.
        missing_file_errors = (
            FileNotFoundError,
            getattr(fitz, "FileNotFoundError", FileNotFoundError),
        )

        texts: List[str] = []
        for file_name in file_names:
            try:
                with fitz.open(parent_dir / file_name) as f:
                    texts.extend(page.get_text() for page in f)
            except missing_file_errors as e:
                # Best effort: a missing attachment is not fatal.
                logger.debug(e)

        # Fall back to the abstract when no PDF text was collected.
        content = "\n".join(texts) or entry.get("abstract", "")
        if self.max_content_chars:
            content = content[: self.max_content_chars]
        metadata = self.parser.get_metadata(entry, load_extra=self.load_extra_metadata)
        return Document(
            page_content=content,
            metadata=metadata,
        )

    def lazy_load(self) -> Iterator[Document]:
        """Load bibtex file using bibtexparser and get the article texts plus the
        article metadata.
        See https://bibtexparser.readthedocs.io/en/master/

        Yields:
            One document per bibtex entry that references a matching file,
            with the document.page_content in text format.

        Raises:
            ImportError: If the PyMuPDF (`fitz`) package is not installed.
        """
        try:
            import fitz  # noqa: F401
        except ImportError as e:
            raise ImportError(
                "PyMuPDF package not found, please install it with "
                "`pip install pymupdf`"
            ) from e

        entries = self.parser.load_bibtex_entries(self.file_path)
        # Only a positive limit truncates: slicing with a negative value such
        # as the documented -1 would silently drop entries from the end.
        if self.max_docs is not None and self.max_docs > 0:
            entries = entries[: self.max_docs]
        for entry in entries:
            doc = self._load_entry(entry)
            if doc:
                yield doc

    def load(self) -> List[Document]:
        """Load bibtex file documents from the bibtex file path given at
        construction time.

        See https://bibtexparser.readthedocs.io/en/master/

        Returns:
            a list of documents with the document.page_content in text format
        """
        return list(self.lazy_load())