luanpoppe committed
Commit 55f46c1 · 1 Parent(s): 131ad89
feat: changes made with peixe on 28/01
- _antigos/resumos/serializer.py +1 -1
- _utils/bubble_integrations/obter_arquivo.py +20 -6
- _utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py +9 -7
- _utils/gerar_relatorio_modelo_usuario/contextual_retriever.py +11 -3
- _utils/gerar_relatorio_modelo_usuario/prompts.py +3 -3
- _utils/handle_files.py +0 -1
- _utils/resumo_completo_cursor.py +4 -4
- _utils/vector_stores/Vector_store_class.py +26 -13
- gerar_documento/serializer.py +6 -6
- gerar_documento/views.py +1 -1
- setup/environment.py +1 -0
_antigos/resumos/serializer.py
CHANGED
@@ -17,7 +17,7 @@ class ResumoPDFSerializer(serializers.Serializer):
     system_prompt = serializers.CharField(required=False)
     user_message = serializers.CharField(required=False, default="")
     model = serializers.CharField(required=False)
-    iterative_refinement = serializers.BooleanField(required=False, default=False)
+    iterative_refinement = serializers.BooleanField(required=False, default=False)  # type: ignore
 
 class ResumoCursorSerializer(serializers.Serializer):
     files = serializers.ListField(child=serializers.FileField(), required=True)
_utils/bubble_integrations/obter_arquivo.py
CHANGED
@@ -1,9 +1,11 @@
 # from setup.easy_imports import PyPDFLoader
 import os
 from langchain_community.document_loaders import PyPDFLoader
-
+import tempfile
 import requests
 
+from _utils.handle_files import return_document_list_with_llama_parser
+
 headers = {"Authorization": f"Bearer {os.environ.get("BUBBLE_TOKEN")}"}
 
 
@@ -14,10 +16,22 @@ headers = {"Authorization": f"Bearer {os.environ.get("BUBBLE_TOKEN")}"}
 # )
 
 
-def get_pdf_from_bubble(
+async def get_pdf_from_bubble(
     file_url=f"https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
+    should_use_llama_parse=False,
 ):
-    result = PyPDFLoader(file_url, headers=headers)
-
-    return result.load()
-
+    if should_use_llama_parse:
+        response = requests.get(file_url, headers=headers)
+        response.raise_for_status()
+
+        # Create a NamedTemporaryFile (with a .pdf suffix) to store the file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+            # Write the response content to the temporary file
+            tmp_file.write(response.content)
+        return await return_document_list_with_llama_parser(
+            tmp_file.name
+        )  # for now this file is not being deleted
+    else:
+        result = PyPDFLoader(file_url, headers=headers)
+
+        return result.load()
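Since get_pdf_from_bubble is now a coroutine, callers must await it. A minimal usage sketch, assuming an async entry point; the file URL is illustrative:

    import asyncio

    from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble

    async def main():
        # should_use_llama_parse=True downloads the PDF to a temp file and parses
        # it with LlamaParse; False keeps the original PyPDFLoader behavior.
        pages = await get_pdf_from_bubble(
            "https://vella.app.br/version-test/fileupload/some-file.pdf",  # illustrative URL
            should_use_llama_parse=True,
        )
        print(f"{len(pages)} pages loaded")

    asyncio.run(main())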
_utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py
CHANGED
@@ -1,5 +1,7 @@
 import os
 from typing import List, Dict, Tuple, Optional
+
+from pydantic import SecretStr
 from _utils.vector_stores.Vector_store_class import VectorStore
 from setup.easy_imports import (
     Chroma,
@@ -25,11 +27,11 @@ from asgiref.sync import sync_to_async
 
 
 class EnhancedDocumentSummarizer(DocumentSummarizer):
+    openai_api_key = os.environ.get("OPENAI_API_KEY", "")
+    claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
 
     def __init__(
         self,
-        openai_api_key: str,
-        claude_api_key: str,
         config: RetrievalConfig,
         embedding_model,
         chunk_size,
@@ -45,8 +47,8 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
         reciprocal_rank_fusion,
     ):
         super().__init__(
-            openai_api_key,
-            os.environ.get("COHERE_API_KEY"),
+            self.openai_api_key,
+            os.environ.get("COHERE_API_KEY", ""),
             embedding_model,
             chunk_size,
             chunk_overlap,
@@ -55,7 +57,7 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
         )
         self.config = config
         self.contextual_retriever = ContextualRetriever(
-            config, claude_api_key, claude_context_model
+            config, self.claude_api_key, claude_context_model
         )
         self.logger = logging.getLogger(__name__)
         self.prompt_auxiliar = prompt_auxiliar
@@ -208,8 +210,8 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
 
         llm = ChatOpenAI(
             temperature=self.gpt_temperature,
-
-            api_key=self.openai_api_key,
+            model=self.gpt_model,
+            api_key=SecretStr(self.openai_api_key),
         )
 
         prompt_auxiliar = PromptTemplate(
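The key passed to ChatOpenAI is now wrapped in pydantic's SecretStr, matching the SecretStr-typed api_key field on langchain's ChatOpenAI (this assumes setup.easy_imports re-exports langchain_openai.ChatOpenAI). A minimal sketch under that assumption:

    from langchain_openai import ChatOpenAI
    from pydantic import SecretStr

    llm = ChatOpenAI(
        temperature=0,
        model="gpt-4o-mini",          # illustrative model name
        api_key=SecretStr("sk-..."),  # illustrative placeholder key
    )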
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py
CHANGED
@@ -165,6 +165,9 @@ class ContextualRetriever:
             ]
             # print("\n\nresult", result)
 
+            if result == "" or result == [""]:
+                print("\n\ncontext", context)
+
             lista_chunks = []
             for index, chunk in enumerate(chunks):
                 lista_chunks.append(
@@ -259,6 +262,7 @@
     listaPDFs: List[str],
     splitterObject: Splitter,
     should_use_llama_parse: bool,
+    isBubble: bool,
 ):
     all_PDFs_chunks = []
 
@@ -266,10 +270,14 @@
 
     # Load and process document
     for pdf_path in listaPDFs:
-        if should_use_llama_parse:
-            pages = pages + await return_document_list_with_llama_parser(pdf_path)
+        if isBubble:
+            pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
         else:
-            pages = pages + PyPDFLoader(pdf_path).load()
+            if should_use_llama_parse:
+                pages = pages + await return_document_list_with_llama_parser(pdf_path)
+            else:
+                pages = pages + PyPDFLoader(pdf_path).load()
+
         chunks = splitterObject.load_and_split_document(
             pdf_path, pages, should_use_llama_parse
         )
_utils/gerar_relatorio_modelo_usuario/prompts.py
CHANGED
@@ -180,10 +180,10 @@ Here are the 20 chunks to analyze:
 </user_input>
 Example output structure (do not copy the content, only the format):
 1. <chunk_context>
-[
+[128372552] --- [Certidão de Ocorrência] --- [This chunk concludes the victim's statement, confirming the details of the robbery and his signature on the report. It aligns with the summary's focus on the victim's testimony as a key piece of evidence in the case.]
 </chunk_context>
-2
-[
+2. <chunk_context>
+[158665972] --- [Certidão de Antecedentes] --- [This chunk includes the certification by the police clerk, Anatalino Lima Farias, confirming the accuracy of the transcribed report. It supports the summary's reliance on official police records to document the crime and its investigation.]
 </chunk_context>
 [Continue for all 20 chunks]
 Please provide context for all 20 chunks, following this structure. It's OK for this section to be quite long.
_utils/handle_files.py
CHANGED
@@ -2,7 +2,6 @@ import tempfile, os
 from typing import List
 from langchain_core.documents import Document as LangchainDocument
 from llama_index import Document
-from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
 from llama_parse import LlamaParse, ResultType
 
 
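Dropping this import likely avoids a circular dependency: obter_arquivo.py now imports return_document_list_with_llama_parser from _utils/handle_files.py, so handle_files.py can no longer import get_pdf_from_bubble back from obter_arquivo.py.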
_utils/resumo_completo_cursor.py
CHANGED
@@ -38,7 +38,9 @@ os.environ.get("LANGCHAIN_API_KEY")
 os.environ["LANGCHAIN_PROJECT"] = "VELLA"
 
 
-async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
+async def get_llm_summary_answer_by_cursor_complete(
+    serializer, listaPDFs=None, isBubble=False
+):
     """The "contexto" parameter should only be passed when you want to run the ragas test and, in that case, don't want to pass PDFs"""
     # Configuration
     config = RetrievalConfig(
@@ -51,8 +53,6 @@ async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
 
     # Initialize enhanced summarizer
     summarizer = EnhancedDocumentSummarizer(
-        openai_api_key=os.environ.get("OPENAI_API_KEY"),
-        claude_api_key=os.environ.get("CLAUDE_API_KEY"),
         config=config,
         embedding_model=serializer["hf_embedding"],
         chunk_overlap=serializer["chunk_overlap"],
@@ -69,7 +69,7 @@ async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
     )
 
     allPdfsChunks, pages = await get_full_text_and_all_PDFs_chunks(
-        listaPDFs, summarizer.splitter, serializer["should_use_llama_parse"]
+        listaPDFs, summarizer.splitter, serializer["should_use_llama_parse"], isBubble
     )
 
     chunks_passados, is_contextualized_chunk = (
_utils/vector_stores/Vector_store_class.py
CHANGED
@@ -1,3 +1,4 @@
+from pydoc import text
 from typing import List, Dict, Tuple, Optional
 from _utils.models.gerar_relatorio import (
     ContextualizedChunk,
@@ -19,26 +20,38 @@ class VectorStore:
         try:
             # Prepare texts with context
             if is_contextualized_chunk:
-                texts = [
+                texts = [
+                    f"Document_id: {chunk.chunk_id}\nDocument_context: {chunk.context}\nDocument_content: {chunk.content}"
+                    for chunk in chunks
+                ]
             else:
                 texts = [f"{chunk.content}" for chunk in chunks]
 
             # Create vector store
             metadatas = []
-            for chunk in chunks:
+            for index, chunk in enumerate(chunks):
                 if is_contextualized_chunk:
-                    context =
+                    context = texts[index]
+                    metadatas.append(
+                        {
+                            "chunk_id": chunk.chunk_id,
+                            "page": chunk.page_number,
+                            "start_char": chunk.start_char,
+                            "end_char": chunk.end_char,
+                            "context": context,
+                        }
+                    )
                 else:
-                    context =
-
-
-
-
-
-
-
-
-
+                    context = texts[index]
+                    metadatas.append(
+                        {
+                            "chunk_id": chunk.chunk_id,
+                            "page": chunk.page_number,
+                            "start_char": chunk.start_char,
+                            "end_char": chunk.end_char,
+                            "context": context,
+                        }
+                    )
 
             vector_store = Chroma.from_texts(
                 texts=texts, metadatas=metadatas, embedding=self.embeddings
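Both branches of the new loop build identical metadata dictionaries, so the if/else could collapse into a single comprehension; a minimal equivalent sketch using the names from the diff (not the committed code):

    # Equivalent, deduplicated form of the loop above; chunks and texts come
    # from the surrounding method.
    metadatas = [
        {
            "chunk_id": chunk.chunk_id,
            "page": chunk.page_number,
            "start_char": chunk.start_char,
            "end_char": chunk.end_char,
            "context": texts[index],  # stored text, with or without the context prefix
        }
        for index, chunk in enumerate(chunks)
    ]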
gerar_documento/serializer.py
CHANGED
@@ -29,16 +29,16 @@ class GerarDocumentoSerializer(ResumoCursorSerializer):
         required=False, default=prompt_gerar_documento
     )
     user_message = serializers.CharField(required=False, default=user_message)
-    num_chunks_retrieval = serializers.IntegerField(default=
+    num_chunks_retrieval = serializers.IntegerField(default=20)
     embedding_weight = serializers.FloatField(default=0.5)
     bm25_weight = serializers.FloatField(default=0.5)
     context_window = serializers.IntegerField(default=3)
     chunk_overlap = serializers.IntegerField(default=200)
-    num_k_rerank = serializers.IntegerField(default=
+    num_k_rerank = serializers.IntegerField(default=20)
     model_cohere_rerank = serializers.CharField(
         required=False, default="rerank-english-v2.0"
     )
-    more_initial_chunks_for_reranking = serializers.IntegerField(default=
+    more_initial_chunks_for_reranking = serializers.IntegerField(default=100)
     claude_context_model = serializers.CharField(
         required=False, default="claude-3-haiku-20240307"
     )
@@ -57,16 +57,16 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
         required=False, default=prompt_gerar_documento
     )
     user_message = serializers.CharField(required=False, default=user_message)
-    num_chunks_retrieval = serializers.IntegerField(default=
+    num_chunks_retrieval = serializers.IntegerField(default=20)
     embedding_weight = serializers.FloatField(default=0.5)
     bm25_weight = serializers.FloatField(default=0.5)
     context_window = serializers.IntegerField(default=3)
     chunk_overlap = serializers.IntegerField(default=200)
-    num_k_rerank = serializers.IntegerField(default=
+    num_k_rerank = serializers.IntegerField(default=20)
     model_cohere_rerank = serializers.CharField(
         required=False, default="rerank-english-v2.0"
     )
-    more_initial_chunks_for_reranking = serializers.IntegerField(default=
+    more_initial_chunks_for_reranking = serializers.IntegerField(default=100)
     claude_context_model = serializers.CharField(
         required=False, default="claude-3-haiku-20240307"
     )
gerar_documento/views.py
CHANGED
@@ -43,7 +43,7 @@ class GerarDocumentoView(AsyncAPIView):
         print("\n\nlistaPDFs: ", listaPDFs)
 
         resposta_llm = await get_llm_summary_answer_by_cursor_complete(
-            data, listaPDFs
+            data, listaPDFs, True
        )
         print("\n\nresposta_llm: ", resposta_llm)
 
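The bare True here fills the new isBubble parameter; an equivalent, more explicit call (a sketch, not the committed code):

    resposta_llm = await get_llm_summary_answer_by_cursor_complete(
        data, listaPDFs, isBubble=True
    )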
setup/environment.py
CHANGED
@@ -3,3 +3,4 @@ default_model = "gpt-4o-mini"
 
 api_url = "https://luanpoppe-vella-backend.hf.space"
 # api_url = "http://localhost:8000"
+