File size: 605 Bytes
3b4f6eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from langchain_community.document_loaders import GutenbergLoader




# class GutenbergEpubLoader(UnstructuredEPubLoader):
#     """Load Project Gutenberg EPub files and remove boilerplate text and add metadata."""
    






    # def preprocess_documents(self, documents: list[Document]) -> list[Document]:
    #     """Remove boilerplate text and add metadata to the documents."""
    #     for doc in tqdm(documents):
    #         doc.page_content = remove_boilerplate(doc.page_content)
    #         doc.metadata.update(get_metadata_from_text(doc.page_content))
        
    #     return documents