Leopat's picture
upload src files
3b4f6eb verified
raw
history blame
605 Bytes
from langchain_community.document_loaders import GutenbergLoader
# class GutenbergEpubLoader(UnstructuredEPubLoader):
# """Load Project Gutenberg EPub files, remove boilerplate text, and add metadata."""
# def preprocess_documents(self, documents: list[Document]) -> list[Document]:
# """Remove boilerplate text and add metadata to the documents."""
# for doc in tqdm(documents):
# doc.page_content = remove_boilerplate(doc.page_content)
# doc.metadata.update(get_metadata_from_text(doc.page_content))
# return documents