Spaces:

alexneakameni
/

medivocate

Running

medivocate / src /vector_store /document_loader.py

Medivocate : An AI-powered platform exploring African history, culture, and traditional medicine, fostering understanding and appreciation of the continent's rich heritage.

15aea1e verified 10 months ago

raw

history blame contribute delete

2.29 kB

	import json
	import os
	from concurrent.futures import ThreadPoolExecutor
	from glob import glob
	from typing import List

	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.document_loaders import DirectoryLoader, TextLoader
	from langchain_core.documents import Document
	from tqdm import tqdm


	def sanitize_metadata(metadata: dict) -> dict:
	sanitized = {}
	for key, value in metadata.items():
	if isinstance(value, list):
	sanitized[key] = ", ".join(value)
	elif isinstance(value, (str, int, float, bool)):
	sanitized[key] = value
	else:
	raise ValueError(
	f"Unsupported metadata type for key '{key}': {type(value)}"
	)
	return sanitized


	class DocumentLoader:
	"""
	Handles loading and splitting documents from directories.
	"""

	def __init__(self, docs_dir: str):
	self.docs_dir = docs_dir

	def load_text_documents(self) -> List[Document]:
	"""Loads and splits text documents."""
	loader = DirectoryLoader(self.docs_dir, glob="*/.txt", loader_cls=TextLoader)
	documents = loader.load()
	splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
	return splitter.split_documents(documents)

	def load_json_documents(self) -> List[Document]:
	"""Loads and processes JSON documents."""
	files = glob(os.path.join(self.docs_dir, "*.json"))

	def load_json_file(file_path):
	with open(file_path, "r") as f:
	data = json.load(f)["kwargs"]
	return Document.model_validate(
	{**data, "metadata": sanitize_metadata(data["metadata"])}
	)

	with ThreadPoolExecutor() as executor:
	documents = list(
	tqdm(
	executor.map(load_json_file, files),
	total=len(files),
	desc="Loading JSON documents",
	)
	)

	return documents

	def load_documents(self) -> List[Document]:
	"""Determines and loads documents based on file type."""
	if glob(os.path.join(self.docs_dir, "*.json")):
	return self.load_json_documents()
	return self.load_text_documents()