davidoneilai
retriever funcionando e novo banco de questoes
8514dc9
raw
history blame contribute delete
No virus
2.18 kB
import os
from langchain_community.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_text_splitters import CharacterTextSplitter
from llm.gemini import Gemini
from utils.questions_parser import parse_question
class Retriever:
    """Build the question retrievers on top of Chroma vector stores.

    After construction the instance exposes:

    * ``vectorstore`` - Chroma store created with
      ``persist_directory="./chroma_db"``.
    * ``metadata_field_info`` - the ``AttributeInfo`` entries describing the
      filterable metadata fields (topico, assunto, dificuldade, tipo).
    * ``retriever`` - a ``SelfQueryRetriever`` over ``vectorstore``.
    * ``docs_retriever`` - a plain retriever over a second Chroma store built
      from the same split documents without a persist directory.
    """

    # Created once, when the class body is executed, and shared by every
    # Retriever instance.
    _model = Gemini()

    def __init__(self):
        """Load the file named by ``DATA_PATH``, index it and set up retrievers.

        Raises:
            ValueError: if the ``DATA_PATH`` environment variable is not set.
        """
        data_path = os.environ.get("DATA_PATH")
        if data_path is None:
            raise ValueError("DATA_PATH environment variable is not set")

        # Read the source file and cut it into chunks of up to 1024
        # characters with no overlap between neighbouring chunks.
        raw_documents = TextLoader(data_path, encoding="UTF-8").load()
        docs = CharacterTextSplitter(
            chunk_size=1024, chunk_overlap=0
        ).split_documents(raw_documents)

        # Store backing the self-query retriever.
        self.vectorstore = Chroma.from_documents(
            docs,
            self._model.embeddings,
            persist_directory="./chroma_db",
        )

        # Every filterable field is declared as a string; only the name and
        # the description shown to the LLM differ.
        string_fields = (
            ("topico", "A materia escolar da qual a questão pertence."),
            ("assunto", "O assunto da materia fornecida anteriormente."),
            ("dificuldade", "O nivel de dificuldade para resolver a questao."),
            (
                "tipo",
                "O tipo da questao. Pode ser ou Multipla Escolha ou Justificativa",
            ),
        )
        self.metadata_field_info = [
            AttributeInfo(name=field_name, description=field_text, type="string")
            for field_name, field_text in string_fields
        ]

        contents_description = "Questões de matérias do ensino médio."

        # NOTE(review): this embeds the same documents a second time into a
        # store created without a persist directory - confirm that keeping
        # two separate stores is intentional.
        secondary_store = Chroma.from_documents(docs, self._model.embeddings)

        self.retriever = SelfQueryRetriever.from_llm(
            self._model.llm,
            self.vectorstore,
            contents_description,
            self.metadata_field_info,
            verbose=True,
        )
        self.docs_retriever = secondary_store.as_retriever()