File size: 6,225 Bytes
1286e81
12d3e1a
 
 
 
 
 
 
1286e81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12d3e1a
1286e81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12d3e1a
1286e81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
from typing import List, Dict, Tuple, Optional
from _utils.splitters.Splitter_class import Splitter
from setup.easy_imports import (
    HuggingFaceEmbeddings,
    Chroma,
    ChatOpenAI,
    PromptTemplate,
)
import logging
from cohere import Client
from _utils.models.gerar_relatorio import (
    DocumentChunk,
)


class DocumentSummarizer:
    """Summarize a document with source citations.

    Pipeline: embed chunks into a Chroma vector store, retrieve candidates by
    similarity, rerank them with Cohere, then ask an OpenAI chat model for key
    points, each paired with its most relevant source chunk.
    """

    def __init__(
        self,
        openai_api_key: str,
        cohere_api_key: str,
        embedding_model,
        chunk_size,
        chunk_overlap,
        num_k_rerank,
        model_cohere_rerank,
    ):
        self.openai_api_key = openai_api_key
        self.cohere_client = Client(cohere_api_key)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.num_k_rerank = num_k_rerank
        self.model_cohere_rerank = model_cohere_rerank
        # Fix: get_source_context() reads self.chunk_metadata, but nothing in
        # this class ever initialized it, so that method always raised
        # AttributeError. Start with an empty {chunk_id: metadata} mapping;
        # callers may populate it.
        self.chunk_metadata: Dict[str, Dict] = {}

        self.splitter = Splitter(chunk_size, chunk_overlap)

    def create_vector_store(
        self, chunks: "List[DocumentChunk]"
    ) -> "Chroma":  # NOTE: this function is currently unused
        """Build a Chroma vector store from chunks, preserving positional metadata.

        Each chunk's id, page number and character span are stored as metadata
        so retrieved documents can be traced back to their origin.
        """
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
            }
            for chunk in chunks
        ]

        vector_store = Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )
        return vector_store

    def rerank_chunks(  # NOTE: this function is currently unused
        self, chunks: List[Dict], query: str, k: int = 5
    ) -> List[Dict]:
        """
        Rerank chunks using Cohere's reranking model.

        Args:
            chunks: List of dictionaries containing chunks and their metadata
            query: Original search query
            k: Number of top chunks to return

        Returns:
            List of reranked chunks with updated relevance scores. On any
            reranking failure, falls back to the first ``k`` chunks in their
            original order.
        """
        try:
            # Cohere scores each document's relevance to the query.
            documents = [chunk["content"] for chunk in chunks]

            results = self.cohere_client.rerank(
                query=query,
                documents=documents,
                top_n=k,
                model=self.model_cohere_rerank,
            )

            # NOTE(review): assumes the rerank response is directly iterable
            # over hits exposing .index / .relevance_score (cohere SDK v4
            # style); newer SDKs nest hits under results.results — confirm
            # against the installed cohere version.
            reranked_chunks = []
            for hit in results:
                original_chunk = chunks[hit.index]
                reranked_chunks.append(
                    {**original_chunk, "relevance_score": hit.relevance_score}
                )

            return reranked_chunks

        except Exception as e:
            logging.error(f"Reranking failed: {str(e)}")
            return chunks[:k]  # Fallback to original ordering

    def generate_summary_with_sources(  # NOTE: this function is currently unused
        self,
        vector_store: "Chroma",
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate summary with source citations using reranking.

        Returns a list of ``{"content": ..., "source": {...}}`` dicts, one per
        key point produced by the LLM; each point is paired with the reranked
        source chunk at the same rank (clamped to the last source).
        """
        # Retrieve a generous candidate set; reranking narrows it down.
        relevant_docs = vector_store.similarity_search_with_score(query, k=20)

        # Prepare chunks for reranking
        chunks = []
        for doc, score in relevant_docs:
            chunks.append(
                {
                    "content": doc.page_content,
                    "page": doc.metadata["page"],
                    "chunk_id": doc.metadata["chunk_id"],
                    "relevance_score": score,
                }
            )

        reranked_chunks = self.rerank_chunks(chunks, query, k=self.num_k_rerank)

        # Prepare context and sources from reranked chunks
        contexts = []
        sources = []

        for chunk in reranked_chunks:
            contexts.append(chunk["content"])
            sources.append(
                {
                    "content": chunk["content"],
                    "page": chunk["page"],
                    "chunk_id": chunk["chunk_id"],
                    "relevance_score": chunk["relevance_score"],
                }
            )

        # Fix: with no retrieved sources the old code crashed with an
        # IndexError on sources[-1] below; there is also nothing to
        # summarize, so skip the LLM call entirely.
        if not sources:
            return []

        prompt_template = """
        Based on the following context, provide multiple key points from the document.
        For each point, create a new paragraph.
        Each paragraph should be a complete, self-contained insight.
        
        Context: {context}
        
        Key points:
        """

        prompt = PromptTemplate(template=prompt_template, input_variables=["context"])

        llm = ChatOpenAI(
            temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
        )

        response = llm.invoke(prompt.format(context="\n\n".join(contexts))).content

        # The prompt asks for one paragraph per key point.
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]

        structured_output = []
        for idx, summary in enumerate(summaries):
            # Pair each summary with the source at the same rank; if the LLM
            # produced more points than we have sources, reuse the last one.
            src = sources[min(idx, len(sources) - 1)]
            structured_output.append(
                {
                    "content": summary,
                    "source": {
                        "page": src["page"],
                        "text": src["content"][:200] + "...",
                        "relevance_score": src["relevance_score"],
                    },
                }
            )

        return structured_output

    def get_source_context(
        self, chunk_id: str, window: int = 100
    ) -> Optional[Dict]:  # NOTE: this function is currently unused
        """Return page and character-span metadata for ``chunk_id``.

        Returns ``None`` when the id is unknown. ``window`` is accepted for
        interface compatibility but is not currently used.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None

        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }