"""Utilities for cleaning Project Gutenberg e-texts: strip the license
boilerplate, pull the header fields into document metadata, and
(de)serialize LangChain Documents to and from JSON."""

import json
import re

from tqdm import tqdm

# from llama_index.core import Document
from langchain_core.documents import Document


def remove_boilerplate(document_text: str) -> str:
    """Strip the Project Gutenberg header and footer, keeping only the book text."""
    # Drop everything up to and including the START marker.
    start_content_match = re.search(r'\*\*\* START OF THE PROJECT GUTENBERG EBOOK [^*]+ \*\*\*', document_text)
    if start_content_match:
        document_text = document_text[start_content_match.end():].strip()

    # Drop the END marker and everything after it (the Gutenberg license).
    end_content_match = re.search(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK [^*]+ \*\*\*', document_text)
    if end_content_match:
        document_text = document_text[:end_content_match.start()].strip()
    return document_text
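
# For reference, the Gutenberg markers matched above look like this
# (the title shown is illustrative):
#   *** START OF THE PROJECT GUTENBERG EBOOK MOBY DICK; OR, THE WHALE ***
#   *** END OF THE PROJECT GUTENBERG EBOOK MOBY DICK; OR, THE WHALE ***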


def get_metadata_from_text(document_text: str) -> dict[str, str]:
    """Extract the Project Gutenberg header fields as a metadata dict."""
    metadata = {}
    # Each field sits on its own line in the header, e.g. "Title: ...".
    for field in ('Title', 'Author', 'Release date', 'Language', 'Credits'):
        match = re.search(rf'{field}\s*:\s*(.*)', document_text)
        if match:
            metadata[field] = match.group(1).strip()
    return metadata
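
# The header block parsed above appears near the top of each e-text, e.g.
# (values illustrative; Credits is present in some books but not all):
#   Title: Moby Dick; Or, The Whale
#   Author: Herman Melville
#   Release date: July 1, 2001 [eBook #2701]
#   Language: English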


def preprocess_documents(documents: list[Document]) -> list[Document]:
    """Add header metadata to each document, then strip the boilerplate in place."""
    for doc in tqdm(documents):
        # Extract metadata first: the fields live in the very boilerplate
        # that remove_boilerplate() strips away.
        doc.metadata.update(get_metadata_from_text(doc.page_content))
        doc.page_content = remove_boilerplate(doc.page_content)

    return documents


def deserialize_documents(serialized_docs: str) -> list[Document]:
    """Rebuild Document objects from their JSON string form."""
    return [
        Document(page_content=doc["page_content"], metadata=doc["metadata"])
        for doc in json.loads(serialized_docs)
    ]


def load_documents_from_file(file_path: str) -> list[Document]:
    """Load serialized documents from a JSON file on disk."""
    with open(file_path, 'r', encoding='utf-8') as file:
        serialized_docs = file.read()
    return deserialize_documents(serialized_docs)
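
# The JSON these helpers read and write is an array of records, e.g.:
#   [{"page_content": "Call me Ishmael...",
#     "metadata": {"Title": "...", "Author": "..."}}]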


def serialize_documents(documents: list[Document]) -> str:
    """Serialize documents to a JSON string of page_content/metadata records."""
    serialized_docs = [
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in documents
    ]
    return json.dumps(serialized_docs)

def save_documents_to_file(documents: list[Document], file_path: str) -> None:
    """Write the serialized documents to a JSON file on disk."""
    serialized_docs = serialize_documents(documents)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(serialized_docs)
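

# A minimal end-to-end sketch (the .txt path below is hypothetical; it
# assumes a raw Project Gutenberg e-text has been downloaded locally):
if __name__ == "__main__":
    with open('moby_dick.txt', 'r', encoding='utf-8') as file:
        raw_text = file.read()

    docs = preprocess_documents([Document(page_content=raw_text)])
    print(docs[0].metadata)  # e.g. {'Title': ..., 'Author': ..., ...}

    # Round-trip through JSON on disk and check the metadata survives.
    save_documents_to_file(docs, 'documents.json')
    loaded = load_documents_from_file('documents.json')
    assert loaded[0].metadata == docs[0].metadata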