Spaces:

Dinesh310
/

demo2

Sleeping

App Files Files Community

demo2 / src /document_ingestion /document_processor.py

Dinesh310

Update src/document_ingestion/document_processor.py

c086254 verified about 1 month ago

raw

history blame contribute delete

4.57 kB

	"""Document processing module for loading and splitting documents"""

	from typing import List, Union
	from langchain_community.document_loaders import WebBaseLoader
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	# from langchain.schema import Document
	from langchain_core.documents import Document
	from pathlib import Path
	from langchain_community.document_loaders import (
	WebBaseLoader,
	PyPDFLoader,
	TextLoader,
	PyPDFDirectoryLoader
	)

	class DocumentProcessor:
	    """Load documents from URLs, PDFs or text files and split them into chunks."""

	    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
	        """
	        Initialize document processor

	        Args:
	            chunk_size: Size of text chunks, forwarded to the text splitter
	            chunk_overlap: Overlap between chunks, forwarded to the text splitter
	        """
	        self.chunk_size = chunk_size
	        self.chunk_overlap = chunk_overlap
	        # Single shared splitter; every chunking method below goes through it.
	        self.splitter = RecursiveCharacterTextSplitter(
	            chunk_size=chunk_size,
	            chunk_overlap=chunk_overlap,
	        )

	    def process_pdf(self, file_paths: List[str]) -> List[Document]:
	        """
	        Load multiple PDFs and return a combined list of chunks

	        Loading is best-effort: a PDF that fails to load is reported on
	        stdout and skipped, so the remaining files are still processed.

	        Args:
	            file_paths: Paths of the PDF files to load

	        Returns:
	            Chunks of every PDF that loaded successfully, in input order
	        """
	        all_documents: List[Document] = []

	        for path in file_paths:
	            try:
	                loader = PyPDFLoader(path)
	                # The splitter attribute is `self.splitter` (see __init__).
	                # Referencing a non-existent attribute here would raise
	                # AttributeError, be swallowed by the handler below, and
	                # make every call return an empty list.
	                chunks = loader.load_and_split(text_splitter=self.splitter)
	            except Exception as e:
	                # Deliberately broad: one bad file must not abort the batch.
	                print(f"Error loading PDF {path}: {e}")
	            else:
	                all_documents.extend(chunks)

	        return all_documents

	    def load_from_url(self, url: str) -> List[Document]:
	        """Load document(s) from a URL"""
	        loader = WebBaseLoader(url)
	        return loader.load()

	    def load_from_pdf_dir(self, directory: Union[str, Path]) -> List[Document]:
	        """Load documents from all PDFs inside a directory"""
	        loader = PyPDFDirectoryLoader(str(directory))
	        return loader.load()

	    def load_from_txt(self, file_path: Union[str, Path]) -> List[Document]:
	        """Load document(s) from a TXT file (read as UTF-8)"""
	        loader = TextLoader(str(file_path), encoding="utf-8")
	        return loader.load()

	    def load_from_pdf(self, file_path: Union[str, Path]) -> List[Document]:
	        """Load document(s) from a single PDF file"""
	        # Load the file the caller asked for, not a fixed "data" directory.
	        loader = PyPDFLoader(str(file_path))
	        return loader.load()

	    def load_documents(self, sources: List[str]) -> List[Document]:
	        """
	        Load documents from URLs, PDF directories, or TXT files

	        Args:
	            sources: List of URLs, PDF folder paths, or TXT file paths

	        Returns:
	            List of loaded documents, in the order of ``sources``

	        Raises:
	            ValueError: If a source is not a URL, an existing directory,
	                or a path ending in ``.txt``
	        """
	        docs: List[Document] = []
	        for src in sources:
	            if src.startswith(("http://", "https://")):
	                docs.extend(self.load_from_url(src))
	                # A URL is fully handled here; it must not fall through to
	                # the filesystem checks below.
	                continue

	            # Inspect the source that was actually passed in.
	            path = Path(src)
	            if path.is_dir():  # PDF directory
	                docs.extend(self.load_from_pdf_dir(path))
	            elif path.suffix.lower() == ".txt":
	                docs.extend(self.load_from_txt(path))
	            else:
	                raise ValueError(
	                    f"Unsupported source type: {src}. "
	                    "Use URL, .txt file, or PDF directory."
	                )
	        return docs

	    def split_documents(self, documents: List[Document]) -> List[Document]:
	        """
	        Split documents into chunks

	        Args:
	            documents: List of documents to split

	        Returns:
	            List of split documents
	        """
	        return self.splitter.split_documents(documents)

	    def process_urls(self, urls: List[str]) -> List[Document]:
	        """
	        Complete pipeline to load and split documents

	        Args:
	            urls: List of sources to process; each entry is handled by
	                ``load_documents``, so directories and .txt paths work too

	        Returns:
	            List of processed document chunks
	        """
	        docs = self.load_documents(urls)
	        return self.split_documents(docs)