import logging
import os
from typing import Any, Dict, List
import uuid
from data.document_loader import DocumentLoader
from data.pdf_reader import PDFReader
from retriever.chunk_documents import chunk_documents
from retriever.vector_store_manager import VectorStoreManager


class DocumentManager:
    """Manage uploaded documents: load, read, chunk, and index them in a vector store."""

    def __init__(self):
        self.doc_loader = DocumentLoader()
        self.pdf_reader = PDFReader()
        self.vector_manager = VectorStoreManager()
        self.uploaded_documents = {}   # filename -> file path on disk
        self.chunked_documents = {}    # filename -> list of chunks
        self.document_ids = {}         # filename -> generated doc_id (UUID string)
        logging.info("DocumentManager initialized")

    def process_document(self, file):
        """
        Process an uploaded file: load, read PDF, chunk, and store in vector store.
        Returns: (status_message, page_list, filename, doc_id)
        """
        try:
            if file is None:
                return "No file uploaded", [], None, None
            logging.info(f"Processing file: {file}")
            # Load and validate file
            file_path = self.doc_loader.load_file(file)
            filename = os.path.basename(file_path)
            # Read PDF content
            page_list = self.pdf_reader.read_pdf(file_path)
            # Store the uploaded document
            self.uploaded_documents[filename] = file_path
            # Generate a unique document ID
            doc_id = str(uuid.uuid4())
            self.document_ids[filename] = doc_id
            # Chunk the pages
            chunks = chunk_documents(page_list, doc_id, chunk_size=2000, chunk_overlap=300)
            self.chunked_documents[filename] = chunks
            # Add chunks to vector store
            self.vector_manager.add_documents(chunks)
            return (
                f"Successfully loaded {filename} with {len(page_list)} pages",
                page_list,
                filename,
                doc_id
            )
        except Exception as e:
            logging.error(f"Error processing document: {str(e)}")
            return f"Error: {str(e)}", [], None, None

    def get_uploaded_documents(self):
        """Return the list of uploaded document filenames."""
        return list(self.uploaded_documents.keys())

    def get_chunks(self, filename):
        """Return chunks for a given filename."""
        return self.chunked_documents.get(filename, [])

    def get_document_id(self, filename):
        """Return the document ID for a given filename."""
        return self.document_ids.get(filename, None)

    def retrieve_top_k(self, query: str, selected_docs: List[str], k: int = 5) -> List[Dict[str, Any]]:
        """
        Retrieve the top K chunks across the selected documents based on the user's query.

        Args:
            query (str): The user's query.
            selected_docs (List[str]): List of selected document filenames from the dropdown.
            k (int): Number of top results to return (default is 5).

        Returns:
            List[Dict[str, Any]]: List of top K chunks with their text, metadata, and scores.
        """
        if not selected_docs:
            logging.warning("No documents selected for retrieval")
            return []
        all_results = []
        for filename in selected_docs:
            doc_id = self.get_document_id(filename)
            if not doc_id:
                logging.warning(f"No document ID found for filename: {filename}")
                continue
            # Search for relevant chunks within this document
            results = self.vector_manager.search(query, doc_id, k=k)
            all_results.extend(results)
        # Sort all results by score in descending order and take the top K
        all_results.sort(key=lambda x: x['score'], reverse=True)
        top_k_results = all_results[:k]
        # Log the list of retrieved documents
        logging.info(f"Retrieved top {k} documents:")
        for i, result in enumerate(top_k_results, 1):
            doc_id = result['metadata'].get('doc_id', 'Unknown')
            filename = next((name for name, d_id in self.document_ids.items() if d_id == doc_id), 'Unknown')
            logging.info(f"{i}. Filename: {filename}, Doc ID: {doc_id}, Score: {result['score']:.4f}, Text: {result['text'][:200]}...")
        return top_k_results

    def retrieve_summary_chunks(self, query: str, doc_id: str, k: int = 10):
        """Retrieve up to k chunks from a single document to use as summary context."""
        logging.info(f"Retrieving {k} chunks for summary: {query}, Document Id: {doc_id}")
        results = self.vector_manager.search(query, doc_id, k=k)
        top_k_results = results[:k]
        logging.info(f"Retrieved {len(top_k_results)} chunks for summary")
        return top_k_results
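

# Hedged usage sketch (not part of the original module): this assumes a local
# "sample.pdf" exists, that DocumentLoader.load_file accepts a plain file path,
# and that PDFReader and VectorStoreManager work with their default configuration.
# The file path and query strings below are hypothetical examples.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    manager = DocumentManager()

    # process_document returns (status_message, page_list, filename, doc_id)
    status, pages, filename, doc_id = manager.process_document("sample.pdf")  # hypothetical path
    print(status)

    if doc_id is not None:
        # Merge per-document searches and keep the 5 highest-scoring chunks overall
        hits = manager.retrieve_top_k("What is the main topic?", [filename], k=5)
        for hit in hits:
            print(f"{hit['score']:.4f} {hit['text'][:80]}")

        # Pull a larger set of chunks from a single document for summarisation
        summary_chunks = manager.retrieve_summary_chunks("summary", doc_id, k=10)
        print(f"{len(summary_chunks)} chunks available for the summary prompt")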