import json
import re

from tqdm import tqdm
# from llama_index.core import Document
from langchain_core.documents import Document

def remove_boilerplate(document_text: str) -> str:
    """Strip the Project Gutenberg license header and footer, keeping only the book text."""
    # Drop everything up to and including the start marker
    start_content_match = re.search(r'\*\*\* START OF THE PROJECT GUTENBERG EBOOK [^*]+ \*\*\*', document_text)
    if start_content_match:
        document_text = document_text[start_content_match.end():].strip()
    # Drop the end marker and everything after it
    end_content_match = re.search(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK [^*]+ \*\*\*', document_text)
    if end_content_match:
        document_text = document_text[:end_content_match.start()].strip()
    return document_text
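
# For reference, the markers matched above typically look like this in a raw
# Gutenberg file (illustrative example; the exact title text varies per book):
#
#   *** START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***
#   ... book text ...
#   *** END OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***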

def get_metadata_from_text(document_text: str) -> dict[str, str]:
    """Extract Project Gutenberg header fields from the raw text via regex."""
    metadata = {}
    # Each field appears on its own line in the header, e.g. "Title: ..."
    for field in ('Title', 'Author', 'Release date', 'Language', 'Credits'):
        match = re.search(rf'{field}\s*:\s*(.*)', document_text)
        if match:
            metadata[field] = match.group(1).strip()
    return metadata
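
# A typical Gutenberg header (illustrative values; wording varies by book):
#
#   Title: Alice's Adventures in Wonderland
#   Author: Lewis Carroll
#   Release date: June 27, 2008 [eBook #11]
#   Language: English
#
# would yield:
#   {'Title': "Alice's Adventures in Wonderland", 'Author': 'Lewis Carroll',
#    'Release date': 'June 27, 2008 [eBook #11]', 'Language': 'English'}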

def preprocess_documents(documents: list[Document]) -> list[Document]:
    """Attach header metadata to each document, then strip the Gutenberg boilerplate."""
    for doc in tqdm(documents):
        # Read metadata from the header before it is removed along with the boilerplate
        doc.metadata.update(get_metadata_from_text(doc.page_content))
        doc.page_content = remove_boilerplate(doc.page_content)
    return documents

# Deserialize documents from a JSON string
def deserialize_documents(serialized_docs: str) -> list[Document]:
    return [
        Document(page_content=doc["page_content"], metadata=doc["metadata"])
        for doc in json.loads(serialized_docs)
    ]

# Load the documents from a JSON file
def load_documents_from_file(file_path: str) -> list[Document]:
    with open(file_path, 'r', encoding='utf-8') as file:
        return deserialize_documents(file.read())

# Serialize documents to a JSON string
def serialize_documents(documents: list[Document]) -> str:
    return json.dumps([
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in documents
    ])

# Save the serialized documents to a JSON file
def save_documents_to_file(documents: list[Document], file_path: str) -> None:
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(serialize_documents(documents))
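
# Minimal usage sketch (hypothetical file names; assumes "raw_docs.json" was
# produced earlier by save_documents_to_file on unprocessed Gutenberg texts):
if __name__ == "__main__":
    documents = load_documents_from_file("raw_docs.json")
    documents = preprocess_documents(documents)
    save_documents_to_file(documents, "clean_docs.json")
    print(f"Processed {len(documents)} documents")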