Leopat's picture
upload src files
3b4f6eb verified
raw
history blame
3.05 kB
import re
from tqdm import tqdm
# from llama_index.core import Document
from langchain_core.documents import Document
import json
def remove_boilerplate(document_text: str) -> str:
    """Strip the Project Gutenberg header and footer from raw ebook text.

    Everything before the ``*** START OF THE PROJECT GUTENBERG EBOOK ... ***``
    marker and everything after the matching ``*** END OF ... ***`` marker is
    discarded. If a marker is absent, that side of the text is left untouched.

    Args:
        document_text: Full raw text of a Project Gutenberg ebook.

    Returns:
        The body text between the two markers, stripped of surrounding
        whitespace.
    """
    header = re.search(r'\*\*\* START OF THE PROJECT GUTENBERG EBOOK [^*]+ \*\*\*', document_text)
    if header is not None:
        document_text = document_text[header.end():].strip()

    footer = re.search(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK [^*]+ \*\*\*', document_text)
    if footer is not None:
        document_text = document_text[:footer.start()].strip()

    return document_text
def get_metadata_from_text(document_text: str) -> dict[str, str]:
    """Extract Project Gutenberg header metadata fields from raw ebook text.

    Searches for the standard ``Label: value`` header lines (Title, Author,
    Release date, Language, Credits). Fields not found are simply omitted
    from the result.

    Args:
        document_text: Raw text of a Project Gutenberg ebook (including its
            header, i.e. before ``remove_boilerplate`` has been applied).

    Returns:
        Mapping of found field labels to their stripped values, e.g.
        ``{"Title": "Moby Dick", "Author": "Herman Melville"}``.
    """
    # Single loop replaces five copy-pasted search/if/assign stanzas; the
    # per-field pattern (label, optional space, colon, rest of line) is
    # identical to the original hand-written regexes.
    field_labels = ('Title', 'Author', 'Release date', 'Language', 'Credits')
    metadata: dict[str, str] = {}
    for label in field_labels:
        match = re.search(rf'{re.escape(label)}\s*:\s*(.*)', document_text)
        if match:
            metadata[label] = match.group(1).strip()
    return metadata
def preprocess_documents(documents: list[Document]):
    """Enrich and clean a batch of documents in place.

    For each document: merge the Gutenberg header metadata into
    ``doc.metadata``, then replace ``doc.page_content`` with the
    boilerplate-stripped body. Shows a tqdm progress bar while iterating.

    Args:
        documents: Documents holding raw Gutenberg text in ``page_content``.

    Returns:
        The same list, with each document mutated in place.
    """
    for document in tqdm(documents):
        extracted = get_metadata_from_text(document.page_content)
        document.metadata.update(extracted)
        document.page_content = remove_boilerplate(document.page_content)
    return documents
# Function to deserialize documents from JSON
def deserialize_documents(serialized_docs):
    """Rebuild ``Document`` objects from a JSON string.

    Inverse of ``serialize_documents``: expects a JSON array of objects
    with ``page_content`` and ``metadata`` keys.

    Args:
        serialized_docs: JSON string produced by ``serialize_documents``.

    Returns:
        List of reconstructed ``Document`` instances.
    """
    records = json.loads(serialized_docs)
    return [
        Document(page_content=record["page_content"], metadata=record["metadata"])
        for record in records
    ]
# Load the documents from a file
def load_documents_from_file(file_path) -> list[Document]:
    """Load serialized documents from a JSON file on disk.

    Args:
        file_path: Path to a file written by ``save_documents_to_file``.

    Returns:
        List of ``Document`` objects reconstructed from the file.
    """
    # Explicit UTF-8: the platform-default codec (e.g. cp1252 on Windows)
    # can fail or mis-decode non-ASCII characters in Gutenberg texts.
    with open(file_path, 'r', encoding='utf-8') as file:
        serialized_docs = file.read()
    return deserialize_documents(serialized_docs)
# Function to serialize documents to JSON
def serialize_documents(documents):
    """Serialize documents to a JSON string.

    Each document contributes an object with its ``page_content`` and
    ``metadata``; the result is a JSON array usable by
    ``deserialize_documents``.

    Args:
        documents: Iterable of objects exposing ``page_content`` and
            ``metadata`` attributes.

    Returns:
        JSON string encoding the documents.
    """
    payload = [
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in documents
    ]
    return json.dumps(payload)
# Save the serialized documents to a file
def save_documents_to_file(documents, file_path):
    """Serialize documents to JSON and write them to a file.

    Args:
        documents: Iterable of objects exposing ``page_content`` and
            ``metadata`` attributes.
        file_path: Destination path; the file is overwritten if it exists.
    """
    serialized_docs = serialize_documents(documents)
    # Explicit UTF-8 keeps the on-disk encoding platform-independent and
    # matches the encoding used by load_documents_from_file.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(serialized_docs)