luanpoppe
fix: pegar ids dos documentos
c5586ab
from typing import List, Tuple
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from _utils.splitters.Splitter_class import Splitter
from _utils.LLMs.LLM_class import LLM
from _utils.gerar_relatorio_modelo_usuario.prompts import (
create_prompt_auxiliar_do_contextual_prompt,
)
def gerar_resposta_compilada(serializer):
    """Collect the request settings that are echoed back in the response.

    Copies a fixed set of entries out of ``serializer`` into a new ``dict``,
    preserving the order in which the keys are listed below.

    Args:
        serializer: Any object supporting ``serializer[key]`` for every key
            listed in ``campos``.

    Returns:
        A new ``dict`` mapping each listed key to ``serializer[key]``.

    Raises:
        Whatever ``serializer[key]`` raises for a missing key (``KeyError``
        when ``serializer`` is a plain ``dict``).
    """
    # "chunk_overlap" was previously written twice in the dict literal; the
    # second entry only overwrote the first with the same value, so it is
    # listed once here.
    campos = (
        "num_chunks_retrieval",
        "embedding_weight",
        "bm25_weight",
        "context_window",
        "chunk_overlap",
        "num_k_rerank",
        "model_cohere_rerank",
        "more_initial_chunks_for_reranking",
        "claude_context_model",
        "gpt_temperature",
        "user_message",
        "model",
        "hf_embedding",
        "chunk_size",
        "prompt_auxiliar",
        "prompt_gerar_documento",
    )
    return {campo: serializer[campo] for campo in campos}
def validate_many_chunks_in_one_request(
    response: str, lista_de_document_ids: List[int]
):
    """Extract the per-chunk entries from an LLM answer covering many chunks.

    The answer is expected to contain entries shaped like
    ``[<id>] --- [<field>] --- [<field>]</chunk_context>``. The id written by
    the LLM is only used to recognise an entry; the id actually returned is
    taken from ``lista_de_document_ids`` at the same position as the entry.

    Args:
        response: Raw text returned by the LLM.
        lista_de_document_ids: Document ids, in the order the entries are
            expected to appear in ``response``.

    Returns:
        ``False`` when no entry is found; otherwise a list of
        ``(document_id, second_field, third_field)`` tuples, one per entry.

    Raises:
        IndexError: If ``response`` has more entries than there are ids in
            ``lista_de_document_ids``.
    """
    import re

    # Strip the optional "document_id" label so that only the bare id is left
    # inside the first pair of brackets. Both spellings are handled with and
    # without the trailing space (the upper-case variant without a space was
    # previously missed because the spaced variant was listed twice).
    context = response
    for label in ("document_id: ", "document_id:", "DOCUMENT_ID: ", "DOCUMENT_ID:"):
        context = context.replace(label, "")

    pattern = (
        r"\[([\d.\-]+)\]\s*---\s*\[([^]]+)\]\s*---\s*\[([^]]+)\]\s*</chunk_context>"
    )
    # Alternative kept for reference; it works when the LLM answer does not
    # come with "document_id" written:
    # pattern = r"\[(\d+|[-.]+)\] --- (.+?) --- (.+?)</chunk_context>"
    matches = re.findall(pattern, context, re.DOTALL)
    if not matches:
        return False

    # The id parsed from the text (match[0]) is deliberately ignored in favour
    # of the caller-supplied id at the same position.
    return [
        (lista_de_document_ids[index], match[1], match[2])
        for index, match in enumerate(matches)
    ]
# Builds the answer that is reused by the request made for each chunk
async def get_response_from_auxiliar_contextual_prompt(full_text_as_array: List[str]):
    """Ask the LLM for the auxiliary summary of the whole text.

    Joins the given pieces into one string, wraps it with
    ``create_prompt_auxiliar_do_contextual_prompt`` and sends the resulting
    prompt to the Gemini model exposed by ``LLM().googleGemini()``. The first
    500 characters of both the prompt and the answer are printed for
    debugging.

    Args:
        full_text_as_array: Pieces of text that are concatenated, in order
            and without separator, to form the full text.

    Returns:
        The ``content`` attribute of the model response.
    """
    # str.join builds the text in a single pass instead of repeated "+=".
    full_text = "".join(full_text_as_array)

    prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)

    print("\n\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])

    # NOTE: the Claude call (aclaude_answer) was disabled because the request
    # went over the token limit and failed; Gemini is used instead.
    llms = LLM()
    response_auxiliar_summary = await llms.googleGemini().ainvoke(
        [HumanMessage(content=prompt_auxiliar_summary)]
    )

    print(
        "\n\n\n\nresponse_auxiliar_summary.content[0:500]: ",
        response_auxiliar_summary.content[0:500],
    )

    return response_auxiliar_summary.content
async def get_full_text_and_all_PDFs_chunks(
    listaPDFs: List[str],
    splitterObject: Splitter,
    should_use_llama_parse: bool,
    isBubble: bool,
):
    """Load every PDF and gather the chunks and pages of all of them.

    Each entry of ``listaPDFs`` is handed to
    ``splitterObject.load_and_split_document`` one after the other, and the
    results are accumulated in input order.

    Args:
        listaPDFs: Identifiers of the PDFs, passed as the first argument to
            ``load_and_split_document``.
        splitterObject: Object providing the async
            ``load_and_split_document`` method.
        should_use_llama_parse: Forwarded unchanged to the splitter.
        isBubble: Forwarded unchanged to the splitter.

    Returns:
        A ``(all_PDFs_chunks, pages)`` tuple with the chunks and the pages of
        every PDF. Both lists are empty when ``listaPDFs`` is empty.
    """
    all_PDFs_chunks = []
    pages: List[str] = []

    # Load and process each document
    for pdf_path in listaPDFs:
        chunks, pdf_pages = await splitterObject.load_and_split_document(
            pdf_path, should_use_llama_parse, isBubble
        )
        all_PDFs_chunks.extend(chunks)
        # Accumulate instead of rebinding: previously only the pages of the
        # last PDF survived the loop while the chunks of all PDFs were kept.
        # Assumes the splitter returns pages as a list of strings, as the
        # List[str] annotation suggests - TODO confirm.
        pages.extend(pdf_pages)

    return all_PDFs_chunks, pages