| """ | |
| Document processing with LlamaIndex. | |
| Handles PDF parsing, indexing, and querying with citation tracking. | |
| """ | |
| import os | |
| import json | |
| from typing import Dict, Any, List | |
| from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings | |
| from llama_index.llms.openai import OpenAI | |
| from llama_index.core.node_parser import SimpleNodeParser | |
| from llama_index.core.schema import NodeWithScore | |
class InvestmentDocumentProcessor:
    """Process investment documents (PDFs) and extract information with citations."""

    def __init__(self, api_key: str):
        """Initialize the processor with an OpenAI API key."""
        # Configure OpenAI GPT-4o-mini (cheap and fast)
        self.llm = OpenAI(
            model="gpt-4o-mini",
            api_key=api_key,
            temperature=0.1  # Low temperature for factual extraction
        )
        # Set the global LLM (embeddings will use the OpenAI default)
        Settings.llm = self.llm

        # Node parser to chunk documents while preserving metadata;
        # larger chunks capture complete financial statements/tables
        self.node_parser = SimpleNodeParser.from_defaults(
            chunk_size=2048,
            chunk_overlap=400
        )

        self.index = None
        self.documents = None
    def load_pdf(self, pdf_path: str) -> None:
        """Load and index a PDF document."""
        # Load the PDF with metadata extraction
        reader = SimpleDirectoryReader(
            input_files=[pdf_path],
            filename_as_id=True
        )
        self.documents = reader.load_data()

        # Add page numbers to metadata if not present
        for doc in self.documents:
            if 'page_label' not in doc.metadata:
                # SimpleDirectoryReader normally adds page info; this is a fallback
                doc.metadata['page_label'] = doc.metadata.get('page', 'Unknown')
        # Create the vector index. In llama-index 0.10+, a custom node parser
        # is passed via `transformations`, not a `node_parser` kwarg.
        self.index = VectorStoreIndex.from_documents(
            self.documents,
            transformations=[self.node_parser],
            show_progress=True
        )
    def analyze_with_criteria(self, criteria_prompt: str) -> Dict[str, Any]:
        """
        Analyze the document against investment criteria.
        Returns the analysis with citations.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        # Create a query engine; citations are derived from its source nodes
        query_engine = self.index.as_query_engine(
            similarity_top_k=20,     # Increased to get more diverse context
            response_mode="compact"  # More focused on relevant chunks
        )
        # Query with the criteria prompt
        response = query_engine.query(criteria_prompt)

        # Extract citations from the source nodes
        citations = self._extract_citations(response.source_nodes)

        # Parse the response (expecting a JSON object)
        try:
            analysis_result = json.loads(str(response))
            if not isinstance(analysis_result, dict):
                # Valid JSON but not an object; wrap it so keys can be attached
                analysis_result = {"raw_response": analysis_result}
        except json.JSONDecodeError:
            # Not JSON at all; wrap the raw text in a structure
            analysis_result = {
                "raw_response": str(response),
                "parse_error": True
            }

        # Attach citations
        analysis_result['citations'] = citations
        analysis_result['source_nodes_count'] = len(response.source_nodes)
        return analysis_result
    def _extract_citations(self, source_nodes: List[NodeWithScore]) -> List[Dict[str, Any]]:
        """Extract citation information from source nodes."""
        citations = []
        for idx, node in enumerate(source_nodes):
            page = node.node.metadata.get(
                'page_label',
                node.node.metadata.get('page', 'Unknown')
            )
            text = node.node.text
            citations.append({
                "index": idx + 1,
                "page": page,
                "score": node.score,
                "text_preview": (text[:350] + "...") if len(text) > 350 else text,
                "full_text": text,
                "is_truncated": len(text) > 350,
                "file_name": node.node.metadata.get('file_name', 'Unknown')
            })
        return citations
    def get_document_summary(self) -> Dict[str, Any]:
        """Get basic document information."""
        if self.documents is None:
            return {"error": "No document loaded"}
        return {
            "num_pages": len(self.documents),
            "file_name": self.documents[0].metadata.get('file_name', 'Unknown'),
            "total_chars": sum(len(doc.text) for doc in self.documents)
        }
    def quick_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """
        Perform a quick search in the document.
        Useful for finding specific sections or terms.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,
            response_mode="no_text"  # Just return nodes, no LLM generation
        )
        response = query_engine.query(query)

        results = []
        for node in response.source_nodes:
            page = node.node.metadata.get(
                'page_label',
                node.node.metadata.get('page', 'Unknown')
            )
            results.append({
                "page": page,
                "text": node.node.text,
                "score": node.score
            })
        return results
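
# Minimal usage sketch, not part of the original module. Assumptions: an
# OPENAI_API_KEY environment variable is set, and "sample_pitch_deck.pdf"
# is a hypothetical input path used only for illustration.
if __name__ == "__main__":
    processor = InvestmentDocumentProcessor(api_key=os.environ["OPENAI_API_KEY"])
    processor.load_pdf("sample_pitch_deck.pdf")  # hypothetical input file
    print(processor.get_document_summary())

    # Quick lookup: top matching chunks with page numbers and similarity scores
    for hit in processor.quick_search("annual recurring revenue", top_k=3):
        print(hit["page"], hit["score"], hit["text"][:80])

    # Full criteria analysis; asking for JSON keeps the parse path happy
    result = processor.analyze_with_criteria(
        "Evaluate this company against our investment criteria. "
        "Respond as a JSON object with keys 'strengths', 'risks', and 'verdict'."
    )
    print(json.dumps(result, indent=2, default=str))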