File size: 4,823 Bytes
e6cc6f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
010d51d
e6cc6f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ecc9585
e6cc6f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
010d51d
c737bb6
e6cc6f7
 
 
c737bb6
e6cc6f7
010d51d
 
 
 
 
 
 
 
e6cc6f7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import logging
import os
from typing import Any, Dict, List
import uuid
from data.document_loader import DocumentLoader
from data.pdf_reader import PDFReader
from retriever.chunk_documents import chunk_documents
from retriever.vector_store_manager import VectorStoreManager

class DocumentManager:
    """Manage uploaded PDF documents: loading, chunking, and vector-store retrieval.

    Keeps three parallel dicts keyed by filename: the uploaded file path,
    the produced chunks, and the generated document UUID.
    """

    def __init__(self):
        self.doc_loader = DocumentLoader()
        self.pdf_reader = PDFReader()
        self.vector_manager = VectorStoreManager()
        # filename -> path returned by DocumentLoader.load_file
        self.uploaded_documents = {}
        # filename -> list of chunks produced by chunk_documents()
        self.chunked_documents = {}
        # filename -> UUID string generated for the document
        self.document_ids = {}
        logging.info("DocumentManager initialized")

    def process_document(self, file):
        """
        Process an uploaded file: load, read PDF, chunk, and store in vector store.

        Args:
            file: The uploaded file (object or path) to process; may be None.

        Returns:
            tuple: (status_message, page_list, filename, doc_id). On failure,
            page_list is [] and filename/doc_id are None.
        """
        try:
            if file is None:
                # BUG FIX: this path previously returned a 3-tuple while the
                # docstring and the exception path below return 4 values.
                return "No file uploaded", [], None, None

            logging.info(f"Processing file: {file}")

            # Load and validate file
            file_path = self.doc_loader.load_file(file)
            filename = os.path.basename(file_path)

            # Read PDF content
            page_list = self.pdf_reader.read_pdf(file_path)

            # Store the uploaded document
            self.uploaded_documents[filename] = file_path

            # Generate a unique document ID
            doc_id = str(uuid.uuid4())
            self.document_ids[filename] = doc_id

            # Chunk the pages
            chunks = chunk_documents(page_list, doc_id, chunk_size=2000, chunk_overlap=300)
            self.chunked_documents[filename] = chunks

            # Add chunks to vector store
            self.vector_manager.add_documents(chunks)

            # BUG FIX: interpolate the actual filename (was a literal
            # "(unknown)" placeholder) and include page_list so the success
            # path matches the documented 4-tuple shape.
            return (
                f"Successfully loaded {filename} with {len(page_list)} pages",
                page_list,
                filename,
                doc_id
            )

        except Exception as e:
            logging.error(f"Error processing document: {str(e)}")
            return f"Error: {str(e)}", [], None, None

    def get_uploaded_documents(self):
        """Return the list of uploaded document filenames."""
        return list(self.uploaded_documents.keys())

    def get_chunks(self, filename):
        """Return chunks for a given filename ([] if unknown)."""
        return self.chunked_documents.get(filename, [])

    def get_document_id(self, filename):
        """Return the document ID for a given filename (None if unknown)."""
        return self.document_ids.get(filename, None)

    def retrieve_top_k(self, query: str, selected_docs: List[str], k: int = 5) -> List[Dict[str, Any]]:
        """
        Retrieve the top K chunks across the selected documents for a query.

        Args:
            query (str): The user's query.
            selected_docs (List[str]): Selected document filenames.
            k (int): Number of top results to return (default 5).

        Returns:
            List[Dict[str, Any]]: Top K chunks with text, metadata, and scores,
            sorted by score descending. Empty list if no documents selected.
        """
        if not selected_docs:
            logging.warning("No documents selected for retrieval")
            return []

        all_results = []
        for filename in selected_docs:
            doc_id = self.get_document_id(filename)
            if not doc_id:
                # BUG FIX: log the actual filename (was a literal "(unknown)").
                logging.warning(f"No document ID found for filename: {filename}")
                continue

            # Search for relevant chunks within this document
            results = self.vector_manager.search(query, doc_id, k=k)
            all_results.extend(results)

        # Sort all results by score in descending order and take the top K
        all_results.sort(key=lambda x: x['score'], reverse=True)
        top_k_results = all_results[:k]

        # Log the list of retrieved documents
        logging.info(f"Retrieved top {k} documents:")
        for i, result in enumerate(top_k_results, 1):
            doc_id = result['metadata'].get('doc_id', 'Unknown')
            # Reverse-map doc_id back to its filename for readable logs.
            filename = next((name for name, d_id in self.document_ids.items() if d_id == doc_id), 'Unknown')
            # BUG FIX: interpolate the resolved filename (was "(unknown)").
            logging.info(f"{i}. Filename: {filename}, Doc ID: {doc_id}, Score: {result['score']:.4f}, Text: {result['text'][:200]}...")

        return top_k_results

    def retrieve_summary_chunks(self, query: str, doc_id: str, k: int = 10):
        """Retrieve up to k chunks from a single document for summarization.

        Args:
            query (str): The summary-oriented query.
            doc_id (str): ID of the document to search within.
            k (int): Maximum number of chunks to return (default 10).

        Returns:
            The first k search results from the vector store.
        """
        logging.info(f"Retrieving {k} chunks for summary: {query}, Document Id: {doc_id}")
        results = self.vector_manager.search(query, doc_id, k=k)
        # Defensive slice in case the vector store over-returns.
        top_k_results = results[:k]
        logging.info(f"Retrieved {len(top_k_results)} chunks for summary")

        return top_k_results