Spaces:

RAMESH143code
/

Vectorless-RAG-Document-QA

Sleeping

App Files Files Community

Vectorless-RAG-Document-QA / src /__init__.py

RAMESH143code

Upload 12 files

4d8a2c2 verified 10 days ago

raw

history blame contribute delete

5.73 kB

	"""
	Vectorless RAG on Structured PDFs
	──────────────────────────────────
	Public package exports for clean imports when using as a library.

	Supports both text-based and scanned PDFs with OCR.

	Usage:
	from src import PDFParser, ParsedDocument
	from src import StructuredChunker, BM25Retriever
	from src import VectorlessRAGPipeline
	"""

	# Core PDF Parser (with OCR support)
	from .pdf_parser import (
	PDFParser,
	ParsedDocument,
	ParsedPage,
	TextBlock,
	Heading,
	TableData,
	DocumentMetadata,
	# OCR Presets and Utilities
	OCR_PRESETS,
	OCR_LANGUAGES,
	TextCleaner,
	)

	# Chunking module (optional): bind None placeholders when it cannot be imported.
	try:
	    from .chunker import (
	        StructuredChunker,
	        Chunk,
	        ChunkType,
	    )
	except ImportError as exc:
	    # ImportError is also raised when chunker.py exists but one of *its*
	    # imports is missing, so include the underlying error in the warning
	    # rather than only claiming the file is absent.
	    import warnings
	    warnings.warn(
	        "chunker module not found. Install or create chunker.py "
	        f"(import error: {exc})"
	    )
	    StructuredChunker = None
	    Chunk = None
	    ChunkType = None

	# BM25 retriever module (optional): bind None placeholders when it cannot be imported.
	try:
	    from .retriever import (
	        BM25Retriever,
	        RetrievalResult,
	    )
	except ImportError as exc:
	    # ImportError is also raised when retriever.py exists but one of *its*
	    # imports is missing, so include the underlying error in the warning
	    # rather than only claiming the file is absent.
	    import warnings
	    warnings.warn(
	        "retriever module not found. Install or create retriever.py "
	        f"(import error: {exc})"
	    )
	    BM25Retriever = None
	    RetrievalResult = None

	# RAG pipeline module (optional): bind None placeholders when it cannot be imported.
	try:
	    from .rag_pipeline import (
	        VectorlessRAGPipeline,
	        RAGResponse,
	        Citation,
	    )
	except ImportError as exc:
	    # ImportError is also raised when rag_pipeline.py exists but one of
	    # *its* imports is missing, so include the underlying error in the
	    # warning rather than only claiming the file is absent.
	    import warnings
	    warnings.warn(
	        "rag_pipeline module not found. Install or create rag_pipeline.py "
	        f"(import error: {exc})"
	    )
	    VectorlessRAGPipeline = None
	    RAGResponse = None
	    Citation = None

	# Package metadata. Kept as plain string literals; info() prints
	# __version__ and __description__, and all three names are listed in __all__.
	# NOTE(review): the description mentions only PDF parsing/OCR although this
	# package also re-exports chunking, retrieval and pipeline names — confirm
	# whether the wording should be broadened.
	__version__ = "2.0.0"
	__author__ = "Vectorless RAG Team"
	__description__ = "PDF parsing with OCR support for scanned documents"

	# Factory helper: build a PDFParser from the four OCR-related options
	def create_parser(ocr_quality: str = "BALANCED",
	                  ocr_language: str = "eng",
	                  parallel_processing: bool = True,
	                  max_workers: int = 4) -> PDFParser:
	    """
	    Build a PDFParser, forwarding every argument as a keyword of the same name.

	    Args:
	        ocr_quality: Quality preset name, e.g. "FAST", "BALANCED",
	            "HIGH_QUALITY", "VERY_HIGH" or "MAXIMUM".
	        ocr_language: OCR language code; several may be joined with '+'
	            (e.g. "eng", "eng+fra", "hin+eng").
	        parallel_processing: Whether OCR should run in parallel.
	        max_workers: Number of parallel workers.

	    Returns:
	        The PDFParser instance constructed with these settings.
	    """
	    parser_options = {
	        "ocr_quality": ocr_quality,
	        "ocr_language": ocr_language,
	        "parallel_processing": parallel_processing,
	        "max_workers": max_workers,
	    }
	    return PDFParser(**parser_options)


	# Show the OCR language table on stdout
	def list_ocr_languages():
	    """
	    Print every entry of OCR_LANGUAGES as a "code - name" row.

	    Output is a heading, a 40-dash rule, one row per language (code padded
	    to 10 characters) and a closing hint about joining codes with '+'.
	    Returns None.
	    """
	    row_template = " {:10} - {}"
	    print("\n🌐 Available OCR Languages:")
	    print(40 * "-")
	    for lang_code, lang_name in OCR_LANGUAGES.items():
	        print(row_template.format(lang_code, lang_name))
	    print("\n💡 Use '+' for multiple languages: eng+fra+deu")


	# Show the OCR quality presets on stdout
	def list_quality_presets():
	    """
	    Print every entry of OCR_PRESETS with its description, DPI and timeout.

	    Each preset produces two lines: the name (padded to 12 characters) with
	    its 'description', then its 'dpi' and 'timeout' values. A heading and a
	    50-dash rule come first; a closing hint comes last. Returns None.
	    """
	    divider = "-" * 50
	    print("\n📷 OCR Quality Presets:")
	    print(divider)
	    for preset_name, settings in OCR_PRESETS.items():
	        summary = settings["description"]
	        print(f" {preset_name:12} - {summary}")
	        dpi, timeout = settings["dpi"], settings["timeout"]
	        print(f" DPI: {dpi}, Timeout: {timeout}s")
	    print("\n💡 Higher quality = slower processing")


	# Print a summary banner for the package
	def info():
	    """
	    Print the package version, description, feature list and OCR counts.

	    The report is framed by 60-character '=' rules and shows __version__,
	    __description__, a fixed feature list, and the number of entries in
	    OCR_LANGUAGES and OCR_PRESETS. Returns None.
	    """
	    rule = "=" * 60
	    language_count = len(OCR_LANGUAGES)
	    preset_count = len(OCR_PRESETS)
	    report = (
	        rule,
	        "Vectorless RAG - PDF Processing Module",
	        rule,
	        f"Version: {__version__}",
	        f"Description: {__description__}",
	        "\n✨ Features:",
	        " • Text-based PDF extraction (fast)",
	        " • Scanned PDF OCR (automatic fallback)",
	        " • Multi-language OCR support",
	        " • Configurable quality presets",
	        " • Parallel processing for speed",
	        " • Heading and table detection",
	        "\n🔧 OCR Configuration:",
	        # Fixed text: nothing here inspects the actual Tesseract setup.
	        " • Tesseract path: Configured",
	        f" • Available languages: {language_count}",
	        f" • Quality presets: {preset_count}",
	        rule,
	    )
	    for line in report:
	        print(line)


	# Public API: the names exported by "from src import *".
	# Kept as a literal list of strings so the export surface is readable at a glance.
	__all__ = [
	    # Core parser names imported from .pdf_parser
	    "PDFParser",
	    "ParsedDocument",
	    "ParsedPage",
	    "TextBlock",
	    "Heading",
	    "TableData",
	    "DocumentMetadata",

	    # OCR presets and utilities, also imported from .pdf_parser
	    "OCR_PRESETS",
	    "OCR_LANGUAGES",
	    "TextCleaner",

	    # Chunker names; each is bound to None when .chunker fails to import
	    "StructuredChunker",
	    "Chunk",
	    "ChunkType",

	    # Retriever names; each is bound to None when .retriever fails to import
	    "BM25Retriever",
	    "RetrievalResult",

	    # Pipeline names; each is bound to None when .rag_pipeline fails to import
	    "VectorlessRAGPipeline",
	    "RAGResponse",
	    "Citation",

	    # Convenience functions defined in this module
	    "create_parser",
	    "list_ocr_languages",
	    "list_quality_presets",
	    "info",

	    # Package metadata strings
	    "__version__",
	    "__author__",
	    "__description__",
	]

	# Check OCR availability on import: OCR_READY is True only when both
	# pytesseract and fitz can be imported.
	try:
	    import pytesseract
	    import fitz
	    OCR_READY = True
	except ImportError as e:
	    OCR_READY = False
	    import warnings
	    warnings.warn(f"OCR dependencies not fully installed: {e}. Scanned PDFs will not work.")

	# Print status message (optional, comment out if not needed)
	_status_message = (
	    "✅ Vectorless RAG module loaded (OCR ready)"
	    if OCR_READY
	    else "⚠️ Vectorless RAG module loaded (OCR not available)"
	)
	try:
	    print(_status_message)
	except UnicodeEncodeError:
	    # stdout cannot encode the emoji (for example a legacy console code
	    # page). A status banner must never make the package unimportable, so
	    # fall back to the same text with the non-ASCII characters dropped.
	    print(_status_message.encode("ascii", "ignore").decode("ascii").strip())
	# Do not leave the temporary banner text in the package namespace.
	del _status_message