Spaces:

jfeng1115
/

marketing-analytics-bot

Runtime error

App Files Files Community

marketing-analytics-bot / langchain /document_loaders /evernote.py

jfeng1115

init commit

58d33f0 about 2 years ago

raw

history blame contribute delete

2.55 kB

	"""Load documents from Evernote.

	https://gist.github.com/foxmask/7b29c43a161e001ff04afdb2f181e31c
	"""
	import hashlib
	from base64 import b64decode
	from time import strptime
	from typing import Any, Dict, List

	from langchain.docstore.document import Document
	from langchain.document_loaders.base import BaseLoader


	def _parse_content(content: str) -> str:
	    """Convert the HTML body of a note to Org-mode text via pandoc.

	    Args:
	        content: HTML markup of the note body. Empty or ``None`` content
	            (for example the text of an empty ``<content/>`` element) is
	            tolerated.

	    Returns:
	        The text produced by ``pypandoc.convert_text``, or an empty string
	        when there is nothing to convert.
	    """
	    # Nothing to convert: return early instead of handing pandoc a
	    # missing/empty body.
	    if not content:
	        return ""

	    # Imported at call time, so pypandoc is only needed when content is
	    # actually converted.
	    from pypandoc import convert_text

	    return convert_text(content, "org", format="html")


	def _parse_resource(resource: list) -> dict:
	    """Flatten a resource element into a dict keyed by child tag.

	    The ``data`` child is base64-decoded to bytes (empty bytes when the
	    element has no text) and the MD5 hex digest of those bytes is stored
	    under ``hash``. Every other child is stored as its raw text.
	    """
	    fields: Dict[str, Any] = {}
	    for child in resource:
	        if child.tag != "data":
	            fields[child.tag] = child.text
	            continue
	        # The text of a data element is sometimes missing entirely.
	        payload = b64decode(child.text) if child.text else b""
	        fields[child.tag] = payload
	        fields["hash"] = hashlib.md5(payload).hexdigest()
	    return fields


	def _parse_note(note: List) -> dict:
	    """Turn the children of a note element into a dict keyed by tag.

	    ``content`` is converted with ``_parse_content`` and the untouched
	    markup is kept under ``content-raw``; ``created`` and ``updated`` are
	    parsed into ``struct_time`` values; each ``resource`` child is parsed
	    with ``_parse_resource`` and the results are collected, in order, under
	    ``resource``. Any other child is stored as its raw text.
	    """
	    attachments = []
	    parsed: Dict[str, Any] = {}
	    for child in note:
	        tag = child.tag
	        if tag == "resource":
	            attachments.append(_parse_resource(child))
	            continue
	        if tag == "content":
	            parsed[tag] = _parse_content(child.text)
	            # Keep the original, unconverted markup alongside the conversion.
	            parsed["content-raw"] = child.text
	        elif tag in ("created", "updated"):
	            parsed[tag] = strptime(child.text, "%Y%m%dT%H%M%SZ")
	        else:
	            parsed[tag] = child.text

	    parsed["resource"] = attachments
	    return parsed


	def _parse_note_xml(xml_file: str) -> str:
	    """Parse an Evernote XML export and return the text of all its notes.

	    Args:
	        xml_file: The export file, passed straight to
	            ``lxml.etree.iterparse`` (the loader passes a file path).

	    Returns:
	        The converted ``content`` of every ``note`` element, concatenated
	        in document order. Notes without content contribute nothing.
	    """
	    # Without huge_tree set to True, parser may complain about huge text node
	    # Try to recover, because there may be "&nbsp;", which will cause
	    # "XMLSyntaxError: Entity 'nbsp' not defined"
	    from lxml import etree

	    context = etree.iterparse(
	        xml_file, encoding="utf-8", strip_cdata=False, huge_tree=True, recover=True
	    )
	    # Collect the pieces and join once instead of repeated string "+=".
	    pieces = []
	    for _action, elem in context:
	        if elem.tag != "note":
	            continue
	        # A note without a <content> child has no "content" key; skip it
	        # rather than failing the whole export with a KeyError.
	        content = _parse_note(elem).get("content")
	        if content:
	            pieces.append(content)
	        # The note has been fully processed; release its subtree so large
	        # exports do not keep every parsed note in memory.
	        elem.clear()
	    return "".join(pieces)


	class EverNoteLoader(BaseLoader):
	    """Loader that reads an EverNote export file into a single document."""

	    def __init__(self, file_path: str):
	        """Store the path of the EverNote file to load."""
	        self.file_path = file_path

	    def load(self) -> List[Document]:
	        """Parse the EverNote file and return it as a one-element list.

	        The document's page content is the text returned by
	        ``_parse_note_xml`` and its metadata records the file path under
	        ``source``.
	        """
	        return [
	            Document(
	                page_content=_parse_note_xml(self.file_path),
	                metadata={"source": self.file_path},
	            )
	        ]