File size: 6,225 Bytes
1286e81
12d3e1a
 
 
 
 
 
 
1286e81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12d3e1a
1286e81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12d3e1a
1286e81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
from typing import List, Dict, Tuple, Optional
from _utils.splitters.Splitter_class import Splitter
from setup.easy_imports import (
    HuggingFaceEmbeddings,
    Chroma,
    ChatOpenAI,
    PromptTemplate,
)
import logging
from cohere import Client
from _utils.models.gerar_relatorio import (
    DocumentChunk,
)


class DocumentSummarizer:
    """Summarize a document with source citations.

    Pipeline: embed chunks into a Chroma vector store, retrieve candidates by
    similarity, rerank them with Cohere, then ask an OpenAI chat model for key
    points, each paired with its most relevant source chunk.
    """

    def __init__(
        self,
        openai_api_key: str,
        cohere_api_key: str,
        embedding_model,
        chunk_size,
        chunk_overlap,
        num_k_rerank,
        model_cohere_rerank,
    ):
        self.openai_api_key = openai_api_key
        self.cohere_client = Client(cohere_api_key)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.num_k_rerank = num_k_rerank
        self.model_cohere_rerank = model_cohere_rerank
        # Fix: get_source_context() reads self.chunk_metadata, but nothing in
        # this class ever initialized it, so that method always raised
        # AttributeError. Start with an empty {chunk_id: metadata} mapping;
        # callers may populate it.
        self.chunk_metadata: Dict[str, Dict] = {}

        self.splitter = Splitter(chunk_size, chunk_overlap)

    def create_vector_store(
        self, chunks: "List[DocumentChunk]"
    ) -> "Chroma":  # NOTE: this function is currently unused
        """Build a Chroma vector store from chunks, preserving positional metadata.

        Each chunk's id, page number and character span are stored as metadata
        so retrieved documents can be traced back to their origin.
        """
        texts = [chunk.content for chunk in chunks]
        metadatas = [
            {
                "chunk_id": chunk.chunk_id,
                "page": chunk.page_number,
                "start_char": chunk.start_char,
                "end_char": chunk.end_char,
            }
            for chunk in chunks
        ]

        vector_store = Chroma.from_texts(
            texts=texts, metadatas=metadatas, embedding=self.embeddings
        )
        return vector_store

    def rerank_chunks(  # NOTE: this function is currently unused
        self, chunks: List[Dict], query: str, k: int = 5
    ) -> List[Dict]:
        """
        Rerank chunks using Cohere's reranking model.

        Args:
            chunks: List of dictionaries containing chunks and their metadata
            query: Original search query
            k: Number of top chunks to return

        Returns:
            List of reranked chunks with updated relevance scores. On any
            reranking failure, falls back to the first ``k`` chunks in their
            original order.
        """
        try:
            # Cohere scores each document's relevance to the query.
            documents = [chunk["content"] for chunk in chunks]

            results = self.cohere_client.rerank(
                query=query,
                documents=documents,
                top_n=k,
                model=self.model_cohere_rerank,
            )

            # NOTE(review): assumes the rerank response is directly iterable
            # over hits exposing .index / .relevance_score (cohere SDK v4
            # style); newer SDKs nest hits under results.results — confirm
            # against the installed cohere version.
            reranked_chunks = []
            for hit in results:
                original_chunk = chunks[hit.index]
                reranked_chunks.append(
                    {**original_chunk, "relevance_score": hit.relevance_score}
                )

            return reranked_chunks

        except Exception as e:
            logging.error(f"Reranking failed: {str(e)}")
            return chunks[:k]  # Fallback to original ordering

    def generate_summary_with_sources(  # NOTE: this function is currently unused
        self,
        vector_store: "Chroma",
        query: str = "Summarize the main points of this document",
    ) -> List[Dict]:
        """Generate summary with source citations using reranking.

        Returns a list of ``{"content": ..., "source": {...}}`` dicts, one per
        key point produced by the LLM; each point is paired with the reranked
        source chunk at the same rank (clamped to the last source).
        """
        # Retrieve a generous candidate set; reranking narrows it down.
        relevant_docs = vector_store.similarity_search_with_score(query, k=20)

        # Prepare chunks for reranking
        chunks = []
        for doc, score in relevant_docs:
            chunks.append(
                {
                    "content": doc.page_content,
                    "page": doc.metadata["page"],
                    "chunk_id": doc.metadata["chunk_id"],
                    "relevance_score": score,
                }
            )

        reranked_chunks = self.rerank_chunks(chunks, query, k=self.num_k_rerank)

        # Prepare context and sources from reranked chunks
        contexts = []
        sources = []

        for chunk in reranked_chunks:
            contexts.append(chunk["content"])
            sources.append(
                {
                    "content": chunk["content"],
                    "page": chunk["page"],
                    "chunk_id": chunk["chunk_id"],
                    "relevance_score": chunk["relevance_score"],
                }
            )

        # Fix: with no retrieved sources the old code crashed with an
        # IndexError on sources[-1] below; there is also nothing to
        # summarize, so skip the LLM call entirely.
        if not sources:
            return []

        prompt_template = """
        Based on the following context, provide multiple key points from the document.
        For each point, create a new paragraph.
        Each paragraph should be a complete, self-contained insight.
        
        Context: {context}
        
        Key points:
        """

        prompt = PromptTemplate(template=prompt_template, input_variables=["context"])

        llm = ChatOpenAI(
            temperature=0, model_name="gpt-4o-mini", api_key=self.openai_api_key
        )

        response = llm.invoke(prompt.format(context="\n\n".join(contexts))).content

        # The prompt asks for one paragraph per key point.
        summaries = [p.strip() for p in response.split("\n\n") if p.strip()]

        structured_output = []
        for idx, summary in enumerate(summaries):
            # Pair each summary with the source at the same rank; if the LLM
            # produced more points than we have sources, reuse the last one.
            src = sources[min(idx, len(sources) - 1)]
            structured_output.append(
                {
                    "content": summary,
                    "source": {
                        "page": src["page"],
                        "text": src["content"][:200] + "...",
                        "relevance_score": src["relevance_score"],
                    },
                }
            )

        return structured_output

    def get_source_context(
        self, chunk_id: str, window: int = 100
    ) -> Optional[Dict]:  # NOTE: this function is currently unused
        """Return page and character-span metadata for ``chunk_id``.

        Returns ``None`` when the id is unknown. ``window`` is accepted for
        interface compatibility but is not currently used.
        """
        metadata = self.chunk_metadata.get(chunk_id)
        if not metadata:
            return None

        return {
            "page": metadata["page"],
            "start_char": metadata["start_char"],
            "end_char": metadata["end_char"],
        }