Spaces:

nivakaran
/

NewFreeRag

Sleeping

NewFreeRag / src /document_loader /loader.py

GitHub Actions

Deploy from GitHub Actions

7c6afad about 2 months ago

4.68 kB

	"""Document loader for various file formats."""

	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import List, Optional, Dict, Any


	@dataclass
	class Document:
	    """Text content of a loaded file paired with descriptive metadata.

	    Attributes:
	        content: The extracted text.
	        metadata: Free-form key/value details about the document; a fresh
	            empty dict is created per instance when none is supplied.
	    """

	    content: str
	    metadata: Dict[str, Any] = field(default_factory=dict)

	    @property
	    def source(self) -> str:
	        """Return the ``"source"`` metadata entry, or ``"unknown"`` if absent."""
	        details = self.metadata
	        origin = details.get("source", "unknown")
	        return origin


	class DocumentLoader:
	    """Load documents from plain-text, Markdown, PDF and DOCX files.

	    PDF and DOCX support depend on the optional ``pypdf`` and
	    ``python-docx`` packages, which are imported only when a file of that
	    type is actually read.
	    """

	    SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx"}

	    def __init__(self):
	        """Initialize the document loader."""
	        # Neither attribute is read by this class; they are kept so that
	        # existing attribute access on instances keeps working.
	        self._pdf_loader = None
	        self._docx_loader = None

	    def load_file(self, file_path: str) -> "Document":
	        """Load a single file.

	        Args:
	            file_path: Path to the file.

	        Returns:
	            Loaded document whose metadata holds the absolute path
	            (``source``), the file name (``filename``) and the lower-cased
	            suffix (``extension``).

	        Raises:
	            ValueError: If file format is not supported.
	            FileNotFoundError: If file doesn't exist.
	        """
	        path = Path(file_path)

	        if not path.exists():
	            raise FileNotFoundError(f"File not found: {file_path}")

	        # Compare case-insensitively so "REPORT.PDF" is treated like ".pdf".
	        extension = path.suffix.lower()

	        if extension not in self.SUPPORTED_EXTENSIONS:
	            raise ValueError(
	                f"Unsupported file format: {extension}. "
	                f"Supported: {self.SUPPORTED_EXTENSIONS}"
	            )

	        content = self._load_by_extension(path, extension)

	        return Document(
	            content=content,
	            metadata={
	                "source": str(path.absolute()),
	                "filename": path.name,
	                "extension": extension,
	            },
	        )

	    def load_directory(
	        self,
	        directory_path: str,
	        recursive: bool = True
	    ) -> "List[Document]":
	        """Load all supported files from a directory.

	        Files that fail to load are reported on stdout and skipped, so one
	        bad file does not abort the whole directory.

	        Args:
	            directory_path: Path to the directory.
	            recursive: Whether to search subdirectories as well.

	        Returns:
	            List of loaded documents, ordered by path.

	        Raises:
	            FileNotFoundError: If the directory doesn't exist.
	            ValueError: If the path is not a directory.
	        """
	        path = Path(directory_path)

	        if not path.exists():
	            raise FileNotFoundError(f"Directory not found: {directory_path}")

	        if not path.is_dir():
	            raise ValueError(f"Not a directory: {directory_path}")

	        documents = []
	        # "**/*" matches entries at every depth (including the top level);
	        # the previous "*/" pattern never descended into subdirectories.
	        pattern = "**/*" if recursive else "*"

	        # Sort so the result order does not depend on the filesystem.
	        for file_path in sorted(path.glob(pattern)):
	            if not file_path.is_file():
	                continue
	            if file_path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
	                continue
	            try:
	                doc = self.load_file(str(file_path))
	            except Exception as e:
	                # Deliberate best-effort: report the failure and move on.
	                print(f"Warning: Failed to load {file_path.name}: {e}")
	            else:
	                documents.append(doc)
	                print(f"Loaded: {file_path.name}")

	        return documents

	    def _load_by_extension(self, path: Path, extension: str) -> str:
	        """Load file content based on extension.

	        Args:
	            path: File path.
	            extension: Lower-cased file extension, including the dot.

	        Returns:
	            File content as string.

	        Raises:
	            ValueError: If no loader exists for the extension.
	        """
	        if extension in {".txt", ".md"}:
	            return self._load_text(path)
	        if extension == ".pdf":
	            return self._load_pdf(path)
	        if extension == ".docx":
	            return self._load_docx(path)
	        raise ValueError(f"Unknown extension: {extension}")

	    def _load_text(self, path: Path) -> str:
	        """Read a plain-text or Markdown file as UTF-8."""
	        return path.read_text(encoding="utf-8")

	    def _load_pdf(self, path: Path) -> str:
	        """Extract the text of every page of a PDF file.

	        Pages that yield no text are skipped; the remaining page texts are
	        joined with a blank line between them.

	        Raises:
	            ImportError: If ``pypdf`` is not installed.
	        """
	        try:
	            from pypdf import PdfReader
	        except ImportError as exc:
	            # Chain the cause so the underlying import failure stays visible.
	            raise ImportError(
	                "pypdf is required for PDF files: pip install pypdf"
	            ) from exc

	        reader = PdfReader(str(path))
	        text_parts = []

	        for page in reader.pages:
	            text = page.extract_text()
	            if text:
	                text_parts.append(text)

	        return "\n\n".join(text_parts)

	    def _load_docx(self, path: Path) -> str:
	        """Extract the non-blank paragraphs of a DOCX file.

	        Paragraphs containing only whitespace are dropped; the rest are
	        joined with a blank line between them.

	        Raises:
	            ImportError: If ``python-docx`` is not installed.
	        """
	        try:
	            from docx import Document as DocxDocument
	        except ImportError as exc:
	            # Chain the cause so the underlying import failure stays visible.
	            raise ImportError(
	                "python-docx is required for DOCX files: pip install python-docx"
	            ) from exc

	        doc = DocxDocument(str(path))
	        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
	        return "\n\n".join(paragraphs)