| """ | |
| Document processing with LlamaIndex. | |
| Handles PDF parsing, indexing, and querying with citation tracking. | |
| """ | |
| import os | |
| import json | |
| from typing import Dict, Any, List | |
| from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings | |
| from llama_index.llms.openai import OpenAI | |
| from llama_index.core.node_parser import SimpleNodeParser | |
| from llama_index.core.schema import NodeWithScore | |
class InvestmentDocumentProcessor:
    """Process investment documents (PDFs) and extract information with citations."""

    def __init__(self, api_key: str):
        """Initialize the processor with an OpenAI API key."""
        # Configure OpenAI GPT-4o-mini (cheap and fast)
        self.llm = OpenAI(
            model="gpt-4o-mini",
            api_key=api_key,
            temperature=0.1  # Low temperature for factual extraction
        )
        # Set the global LLM (embeddings will use the OpenAI default)
        Settings.llm = self.llm

        # Node parser to chunk documents while preserving metadata;
        # larger chunks capture complete financial statements/tables
        self.node_parser = SimpleNodeParser.from_defaults(
            chunk_size=2048,
            chunk_overlap=400
        )

        self.index = None
        self.documents = None
    def load_pdf(self, pdf_path: str) -> None:
        """Load and index a PDF document."""
        # Load the PDF with metadata extraction
        reader = SimpleDirectoryReader(
            input_files=[pdf_path],
            filename_as_id=True
        )
        self.documents = reader.load_data()

        # Add page numbers to metadata if not present
        for doc in self.documents:
            if 'page_label' not in doc.metadata:
                # SimpleDirectoryReader normally adds page info; this is a fallback
                doc.metadata['page_label'] = doc.metadata.get('page', 'Unknown')
        # Create the vector index. In llama-index 0.10+, a custom node parser
        # is passed via `transformations`, not a `node_parser` kwarg.
        self.index = VectorStoreIndex.from_documents(
            self.documents,
            transformations=[self.node_parser],
            show_progress=True
        )
    def analyze_with_criteria(self, criteria_prompt: str) -> Dict[str, Any]:
        """
        Analyze the document against investment criteria.
        Returns the analysis with citations.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        # Create a query engine; citations are derived from its source nodes
        query_engine = self.index.as_query_engine(
            similarity_top_k=20,     # Increased to get more diverse context
            response_mode="compact"  # More focused on relevant chunks
        )
        # Query with the criteria prompt
        response = query_engine.query(criteria_prompt)

        # Extract citations from the source nodes
        citations = self._extract_citations(response.source_nodes)

        # Parse the response (expecting a JSON object)
        try:
            analysis_result = json.loads(str(response))
            if not isinstance(analysis_result, dict):
                # Valid JSON but not an object; wrap it so keys can be attached
                analysis_result = {"raw_response": analysis_result}
        except json.JSONDecodeError:
            # Not JSON at all; wrap the raw text in a structure
            analysis_result = {
                "raw_response": str(response),
                "parse_error": True
            }

        # Attach citations
        analysis_result['citations'] = citations
        analysis_result['source_nodes_count'] = len(response.source_nodes)
        return analysis_result
    def _extract_citations(self, source_nodes: List[NodeWithScore]) -> List[Dict[str, Any]]:
        """Extract citation information from source nodes."""
        citations = []
        for idx, node in enumerate(source_nodes):
            page = node.node.metadata.get(
                'page_label',
                node.node.metadata.get('page', 'Unknown')
            )
            text = node.node.text
            citations.append({
                "index": idx + 1,
                "page": page,
                "score": node.score,
                "text_preview": (text[:350] + "...") if len(text) > 350 else text,
                "full_text": text,
                "is_truncated": len(text) > 350,
                "file_name": node.node.metadata.get('file_name', 'Unknown')
            })
        return citations
    def get_document_summary(self) -> Dict[str, Any]:
        """Get basic document information."""
        if self.documents is None:
            return {"error": "No document loaded"}
        return {
            "num_pages": len(self.documents),
            "file_name": self.documents[0].metadata.get('file_name', 'Unknown'),
            "total_chars": sum(len(doc.text) for doc in self.documents)
        }
    def quick_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """
        Perform a quick search in the document.
        Useful for finding specific sections or terms.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,
            response_mode="no_text"  # Just return nodes, no LLM generation
        )
        response = query_engine.query(query)

        results = []
        for node in response.source_nodes:
            page = node.node.metadata.get(
                'page_label',
                node.node.metadata.get('page', 'Unknown')
            )
            results.append({
                "page": page,
                "text": node.node.text,
                "score": node.score
            })
        return results
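
# Minimal usage sketch, not part of the original module. Assumptions: an
# OPENAI_API_KEY environment variable is set, and "sample_pitch_deck.pdf"
# is a hypothetical input path used only for illustration.
if __name__ == "__main__":
    processor = InvestmentDocumentProcessor(api_key=os.environ["OPENAI_API_KEY"])
    processor.load_pdf("sample_pitch_deck.pdf")  # hypothetical input file
    print(processor.get_document_summary())

    # Quick lookup: top matching chunks with page numbers and similarity scores
    for hit in processor.quick_search("annual recurring revenue", top_k=3):
        print(hit["page"], hit["score"], hit["text"][:80])

    # Full criteria analysis; asking for JSON keeps the parse path happy
    result = processor.analyze_with_criteria(
        "Evaluate this company against our investment criteria. "
        "Respond as a JSON object with keys 'strengths', 'risks', and 'verdict'."
    )
    print(json.dumps(result, indent=2, default=str))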