File size: 5,749 Bytes
129cd69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""Load documents from Evernote.

https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
"""
import hashlib
import logging
from base64 import b64decode
from time import strptime
from typing import Any, Dict, Iterator, List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class EverNoteLoader(BaseLoader):
    """Load from `EverNote`.

    Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
    Instructions on producing this file can be found at
    https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML

    Currently only the plain text in the note is extracted and stored as the contents
    of the Document, any non content metadata (e.g. 'author', 'created', 'updated' etc.
    but not 'content-raw' or 'resource') tags on the note will be extracted and stored
    as metadata on the Document.

    Args:
        file_path (str): The path to the notebook export with a .enex extension
        load_single_document (bool): Whether or not to concatenate the content of all
            notes into a single long Document. If this is set to True (default) then
            the only metadata on the document will be the 'source', which contains
            the path of the export file.
    """  # noqa: E501

    def __init__(self, file_path: str, load_single_document: bool = True):
        """Initialize with file path."""
        self.file_path = file_path
        self.load_single_document = load_single_document

    def load(self) -> List[Document]:
        """Load documents from EverNote export file.

        Returns:
            One Document per note that has content when ``load_single_document``
            is False; otherwise a single-element list whose Document holds the
            content of all notes joined together (no separator) and only the
            'source' metadata.
        """
        # Keys that hold the note body or binary attachments are not metadata.
        excluded_keys = ("content", "content-raw", "resource")

        documents = [
            Document(
                page_content=note["content"],
                metadata={
                    **{
                        key: value
                        for key, value in note.items()
                        if key not in excluded_keys
                    },
                    # Listed last so it wins over a note field named "source".
                    "source": self.file_path,
                },
            )
            for note in self._parse_note_xml(self.file_path)
            # Notes without a body (missing or empty <content>) are skipped.
            if note.get("content") is not None
        ]

        if not self.load_single_document:
            return documents

        return [
            Document(
                page_content="".join(document.page_content for document in documents),
                metadata={"source": self.file_path},
            )
        ]

    @staticmethod
    def _parse_content(content: str) -> str:
        """Convert the markup of a note body to stripped plain text.

        Args:
            content: Raw text of a ``<content>`` element.

        Returns:
            The text produced by ``html2text`` with surrounding whitespace removed.

        Raises:
            ImportError: If the optional ``html2text`` package is not installed.
        """
        # Only the import is guarded, so that an ImportError raised while
        # converting is not misreported as a missing package.
        try:
            import html2text
        except ImportError as e:
            raise ImportError(
                "Could not import `html2text`. Although it is not a required package "
                "to use Langchain, using the EverNote loader requires `html2text`. "
                "Please install `html2text` via `pip install html2text` and try again."
            ) from e

        return html2text.html2text(content).strip()

    @staticmethod
    def _parse_resource(resource: list) -> dict:
        """Convert a ``<resource>`` element into a dict keyed by child tag.

        Args:
            resource: Iterable of child elements exposing ``tag`` and ``text``.

        Returns:
            A dict mapping each child tag to its text. The ``data`` child is
            base64-decoded to bytes and its MD5 hex digest is stored under
            ``hash``.
        """
        rsc_dict: Dict[str, Any] = {}
        for elem in resource:
            if elem.tag == "data":
                # Sometimes elem.text is None
                payload = b64decode(elem.text) if elem.text else b""
                rsc_dict[elem.tag] = payload
                rsc_dict["hash"] = hashlib.md5(payload).hexdigest()
            else:
                rsc_dict[elem.tag] = elem.text

        return rsc_dict

    @staticmethod
    def _parse_note(note: List, prefix: Optional[str] = None) -> dict:
        """Convert a ``<note>`` element (or a nested container) into a dict.

        Args:
            note: Iterable of child elements exposing ``tag`` and ``text``.
            prefix: When given, every returned key is rewritten to
                ``"<prefix>.<key>"``; used for the nested ``note-attributes``.

        Returns:
            A dict with one entry per child tag. ``content`` holds the plain
            text (``None`` for an empty element) and ``content-raw`` the
            original markup; ``created``/``updated`` hold ``time.struct_time``
            values (``None`` for an empty element); ``resource`` holds the list
            of parsed resources when at least one is present.

        Raises:
            ValueError: If a non-empty ``created``/``updated`` value does not
                match the ``%Y%m%dT%H%M%SZ`` format.
        """
        note_dict: Dict[str, Any] = {}
        resources = []

        def add_prefix(element_tag: str) -> str:
            if prefix is None:
                return element_tag
            return f"{prefix}.{element_tag}"

        for elem in note:
            if elem.tag == "content":
                # An empty <content/> has no text to convert; storing None lets
                # load() skip the note instead of failing inside html2text.
                if elem.text is None:
                    note_dict[elem.tag] = None
                else:
                    note_dict[elem.tag] = EverNoteLoader._parse_content(elem.text)
                # A copy of original content
                note_dict["content-raw"] = elem.text
            elif elem.tag == "resource":
                resources.append(EverNoteLoader._parse_resource(elem))
            elif elem.tag in ("created", "updated"):
                # strptime rejects None, so an empty element is kept as None.
                note_dict[elem.tag] = (
                    strptime(elem.text, "%Y%m%dT%H%M%SZ") if elem.text else None
                )
            elif elem.tag == "note-attributes":
                additional_attributes = EverNoteLoader._parse_note(
                    elem, elem.tag
                )  # Recursively enter the note-attributes tag
                note_dict.update(additional_attributes)
            else:
                note_dict[elem.tag] = elem.text

        if resources:
            note_dict["resource"] = resources

        return {add_prefix(key): value for key, value in note_dict.items()}

    @staticmethod
    def _parse_note_xml(xml_file: str) -> Iterator[Dict[str, Any]]:
        """Parse Evernote xml.

        Args:
            xml_file: Path of the ``.enex`` export to read.

        Yields:
            One dict per ``<note>`` element, as built by ``_parse_note``.

        Raises:
            ImportError: If the optional ``lxml`` package is not installed.
        """
        try:
            from lxml import etree
        except ImportError as e:
            raise ImportError(
                "Could not import `lxml`. Although it is not a required package to use "
                "Langchain, using the EverNote loader requires `lxml`. Please install "
                "`lxml` via `pip install lxml` and try again."
            ) from e

        # Without huge_tree set to True, parser may complain about huge text node
        # Try to recover, because there may be `&nbsp;` entities, which will cause
        # "XMLSyntaxError: Entity 'nbsp' not defined"
        context = etree.iterparse(
            xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
        )

        for _, elem in context:
            if elem.tag == "note":
                yield EverNoteLoader._parse_note(elem)
                # The yielded dict only holds plain str/bytes copies, so the
                # element's children and text can be released; otherwise the
                # tree keeps every note body and attachment in memory.
                elem.clear()