luanpoppe committed on
Commit
55f46c1
·
1 Parent(s): 131ad89

feat: alterações feitas com peixe em 28/01

Browse files
_antigos/resumos/serializer.py CHANGED
@@ -17,7 +17,7 @@ class ResumoPDFSerializer(serializers.Serializer):
17
  system_prompt = serializers.CharField(required=False)
18
  user_message = serializers.CharField(required=False, default="")
19
  model = serializers.CharField(required=False)
20
- iterative_refinement = serializers.BooleanField(required=False, default=False)
21
 
22
  class ResumoCursorSerializer(serializers.Serializer):
23
  files = serializers.ListField(child=serializers.FileField(), required=True)
 
17
  system_prompt = serializers.CharField(required=False)
18
  user_message = serializers.CharField(required=False, default="")
19
  model = serializers.CharField(required=False)
20
+ iterative_refinement = serializers.BooleanField(required=False, default=False) # type: ignore
21
 
22
  class ResumoCursorSerializer(serializers.Serializer):
23
  files = serializers.ListField(child=serializers.FileField(), required=True)
_utils/bubble_integrations/obter_arquivo.py CHANGED
@@ -1,9 +1,11 @@
1
  # from setup.easy_imports import PyPDFLoader
2
  import os
3
  from langchain_community.document_loaders import PyPDFLoader
4
-
5
  import requests
6
 
 
 
7
  headers = {"Authorization": f"Bearer {os.environ.get("BUBBLE_TOKEN")}"}
8
 
9
 
@@ -14,10 +16,22 @@ headers = {"Authorization": f"Bearer {os.environ.get("BUBBLE_TOKEN")}"}
14
  # )
15
 
16
 
17
- def get_pdf_from_bubble(
18
  file_url=f"https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
 
19
  ):
20
-
21
- result = PyPDFLoader(file_url, headers=headers)
22
-
23
- return result.load()
 
 
 
 
 
 
 
 
 
 
 
 
1
  # from setup.easy_imports import PyPDFLoader
2
  import os
3
  from langchain_community.document_loaders import PyPDFLoader
4
+ import tempfile
5
  import requests
6
 
7
+ from _utils.handle_files import return_document_list_with_llama_parser
8
+
9
  headers = {"Authorization": f"Bearer {os.environ.get("BUBBLE_TOKEN")}"}
10
 
11
 
 
16
  # )
17
 
18
 
19
async def get_pdf_from_bubble(
    file_url=f"https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
    should_use_llama_parse=False,
):
    """Load a PDF hosted on Bubble and return its parsed documents.

    Args:
        file_url: URL of the PDF to fetch. The request is sent with the
            module-level ``headers`` (Bearer token read from BUBBLE_TOKEN).
        should_use_llama_parse: When true, the PDF is downloaded to a
            temporary ``.pdf`` file and handed to
            ``return_document_list_with_llama_parser``; otherwise the URL is
            passed straight to ``PyPDFLoader``.

    Returns:
        Whatever the selected loader returns: the awaited result of
        ``return_document_list_with_llama_parser`` or ``PyPDFLoader.load()``.

    Raises:
        Whatever ``response.raise_for_status()`` raises when the download
        does not succeed (LlamaParse path only).
    """
    if not should_use_llama_parse:
        return PyPDFLoader(file_url, headers=headers).load()

    response = requests.get(file_url, headers=headers)
    response.raise_for_status()

    # delete=False because the parser opens the file by path; the file is
    # removed explicitly below once parsing has finished.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(response.content)
        tmp_path = tmp_file.name

    # Leaving the ``with`` block closes (and therefore flushes) the file, so
    # the parser never sees a partially written PDF.
    try:
        # NOTE(review): assumes the parser has fully consumed the file by the
        # time this coroutine returns -- confirm against its implementation.
        return await return_document_list_with_llama_parser(tmp_path)
    finally:
        # Always clean up the download, even when parsing fails.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass
_utils/gerar_relatorio_modelo_usuario/EnhancedDocumentSummarizer.py CHANGED
@@ -1,5 +1,7 @@
1
  import os
2
  from typing import List, Dict, Tuple, Optional
 
 
3
  from _utils.vector_stores.Vector_store_class import VectorStore
4
  from setup.easy_imports import (
5
  Chroma,
@@ -25,11 +27,11 @@ from asgiref.sync import sync_to_async
25
 
26
 
27
  class EnhancedDocumentSummarizer(DocumentSummarizer):
 
 
28
 
29
  def __init__(
30
  self,
31
- openai_api_key: str,
32
- claude_api_key: str,
33
  config: RetrievalConfig,
34
  embedding_model,
35
  chunk_size,
@@ -45,8 +47,8 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
45
  reciprocal_rank_fusion,
46
  ):
47
  super().__init__(
48
- openai_api_key,
49
- os.environ.get("COHERE_API_KEY"),
50
  embedding_model,
51
  chunk_size,
52
  chunk_overlap,
@@ -55,7 +57,7 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
55
  )
56
  self.config = config
57
  self.contextual_retriever = ContextualRetriever(
58
- config, claude_api_key, claude_context_model
59
  )
60
  self.logger = logging.getLogger(__name__)
61
  self.prompt_auxiliar = prompt_auxiliar
@@ -208,8 +210,8 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
208
 
209
  llm = ChatOpenAI(
210
  temperature=self.gpt_temperature,
211
- model_name=self.gpt_model,
212
- api_key=self.openai_api_key,
213
  )
214
 
215
  prompt_auxiliar = PromptTemplate(
 
1
  import os
2
  from typing import List, Dict, Tuple, Optional
3
+
4
+ from pydantic import SecretStr
5
  from _utils.vector_stores.Vector_store_class import VectorStore
6
  from setup.easy_imports import (
7
  Chroma,
 
27
 
28
 
29
  class EnhancedDocumentSummarizer(DocumentSummarizer):
30
+ openai_api_key = os.environ.get("OPENAI_API_KEY", "")
31
+ claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
32
 
33
  def __init__(
34
  self,
 
 
35
  config: RetrievalConfig,
36
  embedding_model,
37
  chunk_size,
 
47
  reciprocal_rank_fusion,
48
  ):
49
  super().__init__(
50
+ self.openai_api_key,
51
+ os.environ.get("COHERE_API_KEY", ""),
52
  embedding_model,
53
  chunk_size,
54
  chunk_overlap,
 
57
  )
58
  self.config = config
59
  self.contextual_retriever = ContextualRetriever(
60
+ config, self.claude_api_key, claude_context_model
61
  )
62
  self.logger = logging.getLogger(__name__)
63
  self.prompt_auxiliar = prompt_auxiliar
 
210
 
211
  llm = ChatOpenAI(
212
  temperature=self.gpt_temperature,
213
+ model=self.gpt_model,
214
+ api_key=SecretStr(self.openai_api_key),
215
  )
216
 
217
  prompt_auxiliar = PromptTemplate(
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py CHANGED
@@ -165,6 +165,9 @@ class ContextualRetriever:
165
  ]
166
  # print("\n\nresult", result)
167
 
 
 
 
168
  lista_chunks = []
169
  for index, chunk in enumerate(chunks):
170
  lista_chunks.append(
@@ -259,6 +262,7 @@ async def get_full_text_and_all_PDFs_chunks(
259
  listaPDFs: List[str],
260
  splitterObject: Splitter,
261
  should_use_llama_parse: bool,
 
262
  ):
263
  all_PDFs_chunks = []
264
 
@@ -266,10 +270,14 @@ async def get_full_text_and_all_PDFs_chunks(
266
 
267
  # Load and process document
268
  for pdf_path in listaPDFs:
269
- if should_use_llama_parse:
270
- pages = pages + await return_document_list_with_llama_parser(pdf_path)
271
  else:
272
- pages = pages + get_pdf_from_bubble(pdf_path)
 
 
 
 
273
  chunks = splitterObject.load_and_split_document(
274
  pdf_path, pages, should_use_llama_parse
275
  )
 
165
  ]
166
  # print("\n\nresult", result)
167
 
168
+ if result == "" or result == [""]:
169
+ print("\n\ncontext", context)
170
+
171
  lista_chunks = []
172
  for index, chunk in enumerate(chunks):
173
  lista_chunks.append(
 
262
  listaPDFs: List[str],
263
  splitterObject: Splitter,
264
  should_use_llama_parse: bool,
265
+ isBubble: bool,
266
  ):
267
  all_PDFs_chunks = []
268
 
 
270
 
271
  # Load and process document
272
  for pdf_path in listaPDFs:
273
+ if isBubble:
274
+ pages = pages + await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
275
  else:
276
+ if should_use_llama_parse:
277
+ pages = pages + await return_document_list_with_llama_parser(pdf_path)
278
+ else:
279
+ pages = pages + PyPDFLoader(pdf_path).load()
280
+
281
  chunks = splitterObject.load_and_split_document(
282
  pdf_path, pages, should_use_llama_parse
283
  )
_utils/gerar_relatorio_modelo_usuario/prompts.py CHANGED
@@ -180,10 +180,10 @@ Here are the 20 chunks to analyze:
180
  </user_input>
181
  Example output structure (do not copy the content, only the format):
182
  1. <chunk_context>
183
- [document_id] --- [document_title] --- [brief_context_for_the_chunk]
184
  </chunk_context>
185
- 2.<chunk_context>
186
- [document_id] --- [document_title] --- [brief_context_for_the_chunk]
187
  </chunk_context>
188
  [Continue for all 20 chunks]
189
  Please provide context for all 20 chunks, following this structure. It's OK for this section to be quite long.
 
180
  </user_input>
181
  Example output structure (do not copy the content, only the format):
182
  1. <chunk_context>
183
+ [128372552] --- [Certidão de Ocorrência] --- [This chunk concludes the victim's statement, confirming the details of the robbery and his signature on the report. It aligns with the summary's focus on the victim's testimony as a key piece of evidence in the case.]
184
  </chunk_context>
185
+ 2. <chunk_context>
186
+ [158665972] --- [Certidão de Antecedentes] --- [This chunk includes the certification by the police clerk, Anatalino Lima Farias, confirming the accuracy of the transcribed report. It supports the summary's reliance on official police records to document the crime and its investigation.]
187
  </chunk_context>
188
  [Continue for all 20 chunks]
189
  Please provide context for all 20 chunks, following this structure. It's OK for this section to be quite long.
_utils/handle_files.py CHANGED
@@ -2,7 +2,6 @@ import tempfile, os
2
  from typing import List
3
  from langchain_core.documents import Document as LangchainDocument
4
  from llama_index import Document
5
- from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
6
  from llama_parse import LlamaParse, ResultType
7
 
8
 
 
2
  from typing import List
3
  from langchain_core.documents import Document as LangchainDocument
4
  from llama_index import Document
 
5
  from llama_parse import LlamaParse, ResultType
6
 
7
 
_utils/resumo_completo_cursor.py CHANGED
@@ -38,7 +38,9 @@ os.environ.get("LANGCHAIN_API_KEY")
38
  os.environ["LANGCHAIN_PROJECT"] = "VELLA"
39
 
40
 
41
- async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
 
 
42
  """Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
43
  # Configuration
44
  config = RetrievalConfig(
@@ -51,8 +53,6 @@ async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
51
 
52
  # Initialize enhanced summarizer
53
  summarizer = EnhancedDocumentSummarizer(
54
- openai_api_key=os.environ.get("OPENAI_API_KEY"),
55
- claude_api_key=os.environ.get("CLAUDE_API_KEY"),
56
  config=config,
57
  embedding_model=serializer["hf_embedding"],
58
  chunk_overlap=serializer["chunk_overlap"],
@@ -69,7 +69,7 @@ async def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs=None):
69
  )
70
 
71
  allPdfsChunks, pages = await get_full_text_and_all_PDFs_chunks(
72
- listaPDFs, summarizer.splitter, serializer["should_use_llama_parse"]
73
  )
74
 
75
  chunks_passados, is_contextualized_chunk = (
 
38
  os.environ["LANGCHAIN_PROJECT"] = "VELLA"
39
 
40
 
41
+ async def get_llm_summary_answer_by_cursor_complete(
42
+ serializer, listaPDFs=None, isBubble=False
43
+ ):
44
  """Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
45
  # Configuration
46
  config = RetrievalConfig(
 
53
 
54
  # Initialize enhanced summarizer
55
  summarizer = EnhancedDocumentSummarizer(
 
 
56
  config=config,
57
  embedding_model=serializer["hf_embedding"],
58
  chunk_overlap=serializer["chunk_overlap"],
 
69
  )
70
 
71
  allPdfsChunks, pages = await get_full_text_and_all_PDFs_chunks(
72
+ listaPDFs, summarizer.splitter, serializer["should_use_llama_parse"], isBubble
73
  )
74
 
75
  chunks_passados, is_contextualized_chunk = (
_utils/vector_stores/Vector_store_class.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from typing import List, Dict, Tuple, Optional
2
  from _utils.models.gerar_relatorio import (
3
  ContextualizedChunk,
@@ -19,26 +20,38 @@ class VectorStore:
19
  try:
20
  # Prepare texts with context
21
  if is_contextualized_chunk:
22
- texts = [f"{chunk.context} {chunk.content}" for chunk in chunks]
 
 
 
23
  else:
24
  texts = [f"{chunk.content}" for chunk in chunks]
25
 
26
  # Create vector store
27
  metadatas = []
28
- for chunk in chunks:
29
  if is_contextualized_chunk:
30
- context = chunk.context
 
 
 
 
 
 
 
 
 
31
  else:
32
- context = ""
33
- metadatas.append(
34
- {
35
- "chunk_id": chunk.chunk_id,
36
- "page": chunk.page_number,
37
- "start_char": chunk.start_char,
38
- "end_char": chunk.end_char,
39
- "context": context,
40
- }
41
- )
42
 
43
  vector_store = Chroma.from_texts(
44
  texts=texts, metadatas=metadatas, embedding=self.embeddings
 
1
+ from pydoc import text
2
  from typing import List, Dict, Tuple, Optional
3
  from _utils.models.gerar_relatorio import (
4
  ContextualizedChunk,
 
20
  try:
21
  # Prepare texts with context
22
  if is_contextualized_chunk:
23
+ texts = [
24
+ f"Document_id: {chunk.chunk_id}\nDocument_context: {chunk.context}\nDocument_content: {chunk.content}"
25
+ for chunk in chunks
26
+ ]
27
  else:
28
  texts = [f"{chunk.content}" for chunk in chunks]
29
 
30
  # Create vector store
31
  metadatas = []
32
+ for index, chunk in enumerate(chunks):
33
  if is_contextualized_chunk:
34
+ context = texts[index]
35
+ metadatas.append(
36
+ {
37
+ "chunk_id": chunk.chunk_id,
38
+ "page": chunk.page_number,
39
+ "start_char": chunk.start_char,
40
+ "end_char": chunk.end_char,
41
+ "context": context,
42
+ }
43
+ )
44
  else:
45
+ context = texts[index]
46
+ metadatas.append(
47
+ {
48
+ "chunk_id": chunk.chunk_id,
49
+ "page": chunk.page_number,
50
+ "start_char": chunk.start_char,
51
+ "end_char": chunk.end_char,
52
+ "context": context,
53
+ }
54
+ )
55
 
56
  vector_store = Chroma.from_texts(
57
  texts=texts, metadatas=metadatas, embedding=self.embeddings
gerar_documento/serializer.py CHANGED
@@ -29,16 +29,16 @@ class GerarDocumentoSerializer(ResumoCursorSerializer):
29
  required=False, default=prompt_gerar_documento
30
  )
31
  user_message = serializers.CharField(required=False, default=user_message)
32
- num_chunks_retrieval = serializers.IntegerField(default=5)
33
  embedding_weight = serializers.FloatField(default=0.5)
34
  bm25_weight = serializers.FloatField(default=0.5)
35
  context_window = serializers.IntegerField(default=3)
36
  chunk_overlap = serializers.IntegerField(default=200)
37
- num_k_rerank = serializers.IntegerField(default=5)
38
  model_cohere_rerank = serializers.CharField(
39
  required=False, default="rerank-english-v2.0"
40
  )
41
- more_initial_chunks_for_reranking = serializers.IntegerField(default=20)
42
  claude_context_model = serializers.CharField(
43
  required=False, default="claude-3-haiku-20240307"
44
  )
@@ -57,16 +57,16 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
57
  required=False, default=prompt_gerar_documento
58
  )
59
  user_message = serializers.CharField(required=False, default=user_message)
60
- num_chunks_retrieval = serializers.IntegerField(default=5)
61
  embedding_weight = serializers.FloatField(default=0.5)
62
  bm25_weight = serializers.FloatField(default=0.5)
63
  context_window = serializers.IntegerField(default=3)
64
  chunk_overlap = serializers.IntegerField(default=200)
65
- num_k_rerank = serializers.IntegerField(default=5)
66
  model_cohere_rerank = serializers.CharField(
67
  required=False, default="rerank-english-v2.0"
68
  )
69
- more_initial_chunks_for_reranking = serializers.IntegerField(default=20)
70
  claude_context_model = serializers.CharField(
71
  required=False, default="claude-3-haiku-20240307"
72
  )
 
29
  required=False, default=prompt_gerar_documento
30
  )
31
  user_message = serializers.CharField(required=False, default=user_message)
32
+ num_chunks_retrieval = serializers.IntegerField(default=20)
33
  embedding_weight = serializers.FloatField(default=0.5)
34
  bm25_weight = serializers.FloatField(default=0.5)
35
  context_window = serializers.IntegerField(default=3)
36
  chunk_overlap = serializers.IntegerField(default=200)
37
+ num_k_rerank = serializers.IntegerField(default=20)
38
  model_cohere_rerank = serializers.CharField(
39
  required=False, default="rerank-english-v2.0"
40
  )
41
+ more_initial_chunks_for_reranking = serializers.IntegerField(default=100)
42
  claude_context_model = serializers.CharField(
43
  required=False, default="claude-3-haiku-20240307"
44
  )
 
57
  required=False, default=prompt_gerar_documento
58
  )
59
  user_message = serializers.CharField(required=False, default=user_message)
60
+ num_chunks_retrieval = serializers.IntegerField(default=20)
61
  embedding_weight = serializers.FloatField(default=0.5)
62
  bm25_weight = serializers.FloatField(default=0.5)
63
  context_window = serializers.IntegerField(default=3)
64
  chunk_overlap = serializers.IntegerField(default=200)
65
+ num_k_rerank = serializers.IntegerField(default=20)
66
  model_cohere_rerank = serializers.CharField(
67
  required=False, default="rerank-english-v2.0"
68
  )
69
+ more_initial_chunks_for_reranking = serializers.IntegerField(default=100)
70
  claude_context_model = serializers.CharField(
71
  required=False, default="claude-3-haiku-20240307"
72
  )
gerar_documento/views.py CHANGED
@@ -43,7 +43,7 @@ class GerarDocumentoView(AsyncAPIView):
43
  print("\n\nlistaPDFs: ", listaPDFs)
44
 
45
  resposta_llm = await get_llm_summary_answer_by_cursor_complete(
46
- data, listaPDFs
47
  )
48
  print("\n\nresposta_llm: ", resposta_llm)
49
 
 
43
  print("\n\nlistaPDFs: ", listaPDFs)
44
 
45
  resposta_llm = await get_llm_summary_answer_by_cursor_complete(
46
+ data, listaPDFs, True
47
  )
48
  print("\n\nresposta_llm: ", resposta_llm)
49
 
setup/environment.py CHANGED
@@ -3,3 +3,4 @@ default_model = "gpt-4o-mini"
3
 
4
  api_url = "https://luanpoppe-vella-backend.hf.space"
5
  # api_url = "http://localhost:8000"
 
 
3
 
4
  api_url = "https://luanpoppe-vella-backend.hf.space"
5
  # api_url = "http://localhost:8000"
6
+