Spaces:
Runtime error
Runtime error
File size: 1,061 Bytes
bfb2c2a 21c3825 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
"""Load an epub file into a list of documents."""
from dataclasses import dataclass
from pathlib import Path
from typing import List, Union
from epub2txt import epub2txt
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from loguru import logger
@dataclass
class EpubLoader(BaseLoader):
    """Load an epub file into a list of documents.

    Each title/text pair produced by ``epub2txt`` becomes one ``Document``.

    Args:
        file_path: file path or url to epub

    Returns:
        self.load() -> list of Documents
    """

    file_path: Union[str, Path]

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            One ``Document`` per (title, text) pair, with the text as
            ``page_content`` and ``{"source": <file_path as str>, "ch.": <title>}``
            as metadata.

        Raises:
            Exception: any error raised while converting the epub is logged
                and then re-raised unchanged.
        """
        try:
            texts = epub2txt(self.file_path, outputlist=True)
            # Titles are read off the epub2txt callable right after the call;
            # presumably it refreshes them on each conversion -- confirm.
            ch_titles = epub2txt.content_titles
        except Exception as exc:
            logger.error(exc)
            raise

        # Store the source as a plain string: file_path may be a pathlib.Path,
        # which is not JSON-serializable and can break consumers that expect
        # primitive metadata values.
        source = str(self.file_path)

        # zip() stops at the shorter of the two sequences.
        return [
            Document(page_content=text, metadata={"source": source, "ch.": title})
            for title, text in zip(ch_titles, texts)
        ]
|