import json
import re

from tqdm import tqdm
# from llama_index.core import Document
from langchain_core.documents import Document

def remove_boilerplate(document_text: str) -> str:
    """Strip the Project Gutenberg license header and footer, keeping only the book text."""
    # Drop everything up to and including the start marker
    start_content_match = re.search(r'\*\*\* START OF THE PROJECT GUTENBERG EBOOK [^*]+ \*\*\*', document_text)
    if start_content_match:
        document_text = document_text[start_content_match.end():].strip()
    # Drop the end marker and everything after it
    end_content_match = re.search(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK [^*]+ \*\*\*', document_text)
    if end_content_match:
        document_text = document_text[:end_content_match.start()].strip()
    return document_text
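
# For reference, the markers matched above typically look like this in a raw
# Gutenberg file (illustrative example; the exact title text varies per book):
#
#   *** START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***
#   ... book text ...
#   *** END OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***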

def get_metadata_from_text(document_text: str) -> dict[str, str]:
    """Extract Project Gutenberg header fields from the raw text via regex."""
    metadata = {}
    # Each field appears on its own line in the header, e.g. "Title: ..."
    for field in ('Title', 'Author', 'Release date', 'Language', 'Credits'):
        match = re.search(rf'{field}\s*:\s*(.*)', document_text)
        if match:
            metadata[field] = match.group(1).strip()
    return metadata
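
# A typical Gutenberg header (illustrative values; wording varies by book):
#
#   Title: Alice's Adventures in Wonderland
#   Author: Lewis Carroll
#   Release date: June 27, 2008 [eBook #11]
#   Language: English
#
# would yield:
#   {'Title': "Alice's Adventures in Wonderland", 'Author': 'Lewis Carroll',
#    'Release date': 'June 27, 2008 [eBook #11]', 'Language': 'English'}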

def preprocess_documents(documents: list[Document]) -> list[Document]:
    """Attach header metadata to each document, then strip the Gutenberg boilerplate."""
    for doc in tqdm(documents):
        # Read metadata from the header before it is removed along with the boilerplate
        doc.metadata.update(get_metadata_from_text(doc.page_content))
        doc.page_content = remove_boilerplate(doc.page_content)
    return documents

# Deserialize documents from a JSON string
def deserialize_documents(serialized_docs: str) -> list[Document]:
    return [
        Document(page_content=doc["page_content"], metadata=doc["metadata"])
        for doc in json.loads(serialized_docs)
    ]

# Load the documents from a JSON file
def load_documents_from_file(file_path: str) -> list[Document]:
    with open(file_path, 'r', encoding='utf-8') as file:
        return deserialize_documents(file.read())

# Serialize documents to a JSON string
def serialize_documents(documents: list[Document]) -> str:
    return json.dumps([
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in documents
    ])

# Save the serialized documents to a JSON file
def save_documents_to_file(documents: list[Document], file_path: str) -> None:
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(serialize_documents(documents))
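
# Minimal usage sketch (hypothetical file names; assumes "raw_docs.json" was
# produced earlier by save_documents_to_file on unprocessed Gutenberg texts):
if __name__ == "__main__":
    documents = load_documents_from_file("raw_docs.json")
    documents = preprocess_documents(documents)
    save_documents_to_file(documents, "clean_docs.json")
    print(f"Processed {len(documents)} documents")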