import json
import re

from tqdm import tqdm

# from llama_index.core import Document
from langchain_core.documents import Document


def remove_boilerplate(document_text: str) -> str:
    # Strip the Project Gutenberg header and footer that surround the actual content
    start_content_match = re.search(r'\*\*\* START OF THE PROJECT GUTENBERG EBOOK [^*]+ \*\*\*', document_text)
    if start_content_match:
        document_text = document_text[start_content_match.end():].strip()

    end_content_match = re.search(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK [^*]+ \*\*\*', document_text)
    if end_content_match:
        document_text = document_text[:end_content_match.start()].strip()

    return document_text


def get_metadata_from_text(document_text: str) -> dict[str, str]:
    # Extract metadata fields from the Gutenberg header using regular expressions
    title_match = re.search(r'Title\s*:\s*(.*)', document_text)
    author_match = re.search(r'Author\s*:\s*(.*)', document_text)
    release_date_match = re.search(r'Release date\s*:\s*(.*)', document_text)
    language_match = re.search(r'Language\s*:\s*(.*)', document_text)
    credits_match = re.search(r'Credits\s*:\s*(.*)', document_text)

    # Keep only the fields that were actually found
    metadata = {}
    if title_match:
        metadata['Title'] = title_match.group(1).strip()
    if author_match:
        metadata['Author'] = author_match.group(1).strip()
    if release_date_match:
        metadata['Release date'] = release_date_match.group(1).strip()
    if language_match:
        metadata['Language'] = language_match.group(1).strip()
    if credits_match:
        metadata['Credits'] = credits_match.group(1).strip()
    return metadata


def preprocess_documents(documents: list[Document]) -> list[Document]:
    # Extract metadata first, then strip the boilerplate header it was read from
    for doc in tqdm(documents):
        doc.metadata.update(get_metadata_from_text(doc.page_content))
        doc.page_content = remove_boilerplate(doc.page_content)
    return documents


# Deserialize documents from JSON
def deserialize_documents(serialized_docs: str) -> list[Document]:
    documents = []
    for doc in json.loads(serialized_docs):
        documents.append(Document(page_content=doc["page_content"], metadata=doc["metadata"]))
    return documents


# Load the serialized documents from a file
def load_documents_from_file(file_path: str) -> list[Document]:
    with open(file_path, 'r', encoding='utf-8') as file:
        serialized_docs = file.read()
    return deserialize_documents(serialized_docs)


# Serialize documents to JSON
def serialize_documents(documents: list[Document]) -> str:
    serialized_docs = []
    for doc in documents:
        serialized_docs.append({
            "page_content": doc.page_content,
            "metadata": doc.metadata
        })
    return json.dumps(serialized_docs)


# Save the serialized documents to a file
def save_documents_to_file(documents: list[Document], file_path: str) -> None:
    serialized_docs = serialize_documents(documents)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(serialized_docs)
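

# Example usage: a minimal sketch of the end-to-end flow, not part of the
# module above. The "books/" input directory and "documents.json" output
# path are placeholders; adjust them to your own layout.
if __name__ == "__main__":
    from pathlib import Path

    # Wrap each raw Project Gutenberg .txt file in a Document (metadata
    # defaults to an empty dict), then clean and annotate them in one pass.
    raw_docs = [
        Document(page_content=path.read_text(encoding="utf-8"))
        for path in Path("books").glob("*.txt")  # placeholder directory
    ]
    processed = preprocess_documents(raw_docs)

    # Round-trip through JSON to confirm documents survive save/load intact.
    save_documents_to_file(processed, "documents.json")
    reloaded = load_documents_from_file("documents.json")
    assert len(reloaded) == len(processed)
    print(f"Preprocessed and reloaded {len(reloaded)} documents")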