davidoneilai committed
Commit 8514dc9
1 Parent(s): 4db208a

Retriever working and new question bank

.dockerignore CHANGED
@@ -41,4 +41,5 @@ next-env.d.ts
 .yarn
 
 *venv
-como_nao_errar.txt
+como_nao_errar.txt
+server/venv
.gitignore CHANGED
@@ -28,3 +28,5 @@ dist-ssr
 
 *.env
 *chroma_db
+como_nao_errar.txt
+server/venv
server/app.py CHANGED
@@ -3,11 +3,14 @@ from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from fastapi.staticfiles import StaticFiles
 from services.generate_questions_service import GenerateQuestionsService
+
 # from data.load_data import retriever_pre
 
 generate_questions_service = GenerateQuestionsService()
 
+
 class Body(BaseModel):
+    school_subject: str
     subject: str
     difficultie: str
 
@@ -24,9 +27,10 @@ app.add_middleware(
 
 @app.post("/generate_questions")
 async def generate_questions(body: Body):
+    school_subject = body.school_subject
     subject = body.subject
     difficultie = body.difficultie
-    query = f"Quero que você gere questões de biologia, sendo do assunto: {subject} e sendo da dificuldade: {difficultie}."
+    query = f"Quero que você gere questões de {school_subject}, sendo do assunto: {subject} e sendo da dificuldade: {difficultie}."
     res = generate_questions_service.handle(f"""{query}""")
     return res
 
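Note for clients of POST /generate_questions: Body now requires the extra school_subject field, so existing callers must send it or FastAPI will reject the request with a 422. A minimal request sketch, assuming uvicorn's default host and port (not part of this commit):

# Hypothetical client call against the updated endpoint.
import requests

resp = requests.post(
    "http://localhost:8000/generate_questions",
    json={
        "school_subject": "história",     # new required field
        "subject": "Revolução Francesa",
        "difficultie": "médio",
    },
)
print(resp.json())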
server/data/retriever.py CHANGED
@@ -3,9 +3,11 @@ from langchain_community.document_loaders import TextLoader
 from langchain.vectorstores import Chroma
 from langchain.chains.query_constructor.base import AttributeInfo
 from langchain.retrievers.self_query.base import SelfQueryRetriever
+from langchain_text_splitters import CharacterTextSplitter
 from llm.gemini import Gemini
 from utils.questions_parser import parse_question
 
+
 class Retriever:
 
     _model = Gemini()
@@ -17,21 +19,14 @@ class Retriever:
 
         DATA_PATH = os.environ["DATA_PATH"]
 
-        self.data_loader = TextLoader(DATA_PATH, encoding="UTF-8").load()
-
-        self.questions = list(
-            map(lambda x: "##Questão" + x, self.data_loader[0].page_content.split("##Questão"))
-        )
-
-        self.docs = []
+        data_loader = TextLoader(DATA_PATH, encoding="UTF-8").load()
 
-        for question in self.questions:
-            try:
-                self.docs.append(parse_question(question))
-            except Exception as e:
-                print(e, question)
+        text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
+        docs = text_splitter.split_documents(data_loader)
 
-        self.vectorstore = Chroma.from_documents(self.docs, self._model.embeddings, persist_directory="./chroma_db")
+        self.vectorstore = Chroma.from_documents(
+            docs, self._model.embeddings, persist_directory="./chroma_db"
+        )
 
         self.metadata_field_info = [
             AttributeInfo(
@@ -58,6 +53,14 @@ class Retriever:
 
         document_content_description = "Questões de matérias do ensino médio."
 
+        db = Chroma.from_documents(docs, self._model.embeddings)
+
         self.retriever = SelfQueryRetriever.from_llm(
-            self._model.llm, self.vectorstore, document_content_description, self.metadata_field_info, verbose=True
+            self._model.llm,
+            self.vectorstore,
+            document_content_description,
+            self.metadata_field_info,
+            verbose=True,
+        )
+
+        self.docs_retriever = db.as_retriever()
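The constructor now exposes two retrieval paths: self.retriever, a SelfQueryRetriever that has the LLM build a structured query (search string plus metadata filters) against the persisted Chroma store, and self.docs_retriever, plain vector similarity over the same chunks. A usage sketch; the DATA_PATH value is illustrative:

# Hypothetical driver for the two retrievers built above.
import os
os.environ["DATA_PATH"] = "server/databases/banco_de_dados_BIO_HIS_v1.txt"  # assumed path

from data.retriever import Retriever

retriever = Retriever()
query = "questões fáceis de biologia sobre células"

structured_hits = retriever.retriever.invoke(query)       # LLM-built filters + search
similarity_hits = retriever.docs_retriever.invoke(query)  # raw similarity search

Design note: db = Chroma.from_documents(docs, self._model.embeddings) builds a second, in-memory index over the same documents, which doubles the embedding calls at startup; self.vectorstore.as_retriever() would reuse the persisted index instead. Also, parse_question is still imported but no longer used now that CharacterTextSplitter replaces the per-question parsing.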
server/databases/{banco_de_questoes_v3.txt → banco_de_dados_BIO_HIS_v1.txt} RENAMED
The diff for this file is too large to render. See raw diff
 
server/llm/gemini.py CHANGED
@@ -84,4 +84,4 @@ class Gemini:
             )
         ]
 
-        self.parser = StructuredOutputParser.from_response_schemas(self.schemas)
+        self.parser = StructuredOutputParser.from_response_schemas(self.schemas)
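For context on the line touched above: StructuredOutputParser turns a list of ResponseSchema entries into format instructions for the prompt and parses the model's JSON reply back into a dict. A standalone sketch with illustrative schema names (the repo's real schemas live in Gemini.schemas):

from langchain.output_parsers import ResponseSchema, StructuredOutputParser

schemas = [
    ResponseSchema(name="question", description="O enunciado da questão."),
    ResponseSchema(name="answer", description="A alternativa correta."),
]
parser = StructuredOutputParser.from_response_schemas(schemas)

print(parser.get_format_instructions())  # injected into the prompt template
# parser.parse(llm_output_text) -> {"question": "...", "answer": "..."}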
server/services/generate_questions_service.py CHANGED
@@ -11,39 +11,43 @@ class GenerateQuestionsService:
     _model = Gemini()
 
     def handle(self, query: str):
 
         rag_chain = {
             "context": self._retrieve.retriever | RunnableLambda(self._format_docs),
             "question": RunnablePassthrough(),
         } | RunnableLambda(self._get_questions)
+        response_rag = self._retrieve.docs_retriever
 
-        return rag_chain.invoke(query)
-
+        rag_result = rag_chain.invoke(query)
+        retriever_result = response_rag.invoke(query)
 
+        print("RAG result:", rag_result)
+        print("Retriever result:", retriever_result)
+
+        return {"rag_result": rag_result, "retriever_result": retriever_result}
+
     def _get_questions(self, _dict):
 
         question = _dict["question"]
         context = _dict["context"]
         messages = self._model.template.format_messages(
             context=context,
             question=question,
             format_questions_instructions=self._model._format_questions_instructions,
         )
 
         tries = 0
 
         while tries < 3:
             try:
                 chat = ChatGoogleGenerativeAI(model="gemini-pro")
                 response = chat.invoke(messages)
                 return self._model.parser.parse(response.content)
             except Exception as e:
                 print(e)
                 tries += 1
 
         return "Não foi possível gerar as questões."
 
-
     def _format_docs(self, docs):
         return "\n\n".join(doc.page_content for doc in docs)
-
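The contract of handle() changed: it previously returned only the parsed questions and now returns a dict with both results, so callers need to unpack it. A sketch, assuming the service is importable as in the repo layout:

# Hypothetical caller of the updated service.
from services.generate_questions_service import GenerateQuestionsService

service = GenerateQuestionsService()
result = service.handle(
    "Quero que você gere questões de biologia, assunto: células, dificuldade: fácil."
)

questions = result["rag_result"]           # parsed questions, or the fallback string
context_docs = result["retriever_result"]  # Documents from docs_retriever

The print calls in handle() run on every request; they read as debug output left in to verify the retriever and could be swapped for logging before production use.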