import os

from langchain_community.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_text_splitters import CharacterTextSplitter

from llm.gemini import Gemini
from utils.questions_parser import parse_question


class Retriever:
    """Build the retrievers used to look up question documents.

    On construction the text file named by the ``DATA_PATH`` environment
    variable is loaded, split into chunks and indexed in Chroma.

    Attributes:
        vectorstore: Chroma store created with ``persist_directory="./chroma_db"``.
        metadata_field_info: ``AttributeInfo`` entries describing the metadata
            fields handed to the self-query retriever.
        retriever: ``SelfQueryRetriever`` built on ``vectorstore``.
        docs_retriever: Plain retriever over a second Chroma store that is
            created without a persist directory.
    """

    # Instantiated once, when the class body is executed, and shared by every
    # instance; ``__init__`` reads its ``embeddings`` and ``llm`` attributes.
    _model = Gemini()

    def __init__(self):
        """Load, split and index the data file, then create both retrievers.

        Raises:
            ValueError: If ``DATA_PATH`` is not set, or is set to an empty
                string.
        """
        data_path = os.environ.get("DATA_PATH")
        if data_path is None:
            raise ValueError("DATA_PATH environment variable is not set")
        if not data_path:
            # An empty value would otherwise reach TextLoader as an empty path
            # and fail there with a far less helpful error.
            raise ValueError("DATA_PATH environment variable is empty")

        documents = TextLoader(data_path, encoding="UTF-8").load()
        text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
        docs = text_splitter.split_documents(documents)

        # NOTE(review): from_documents runs against the same persist directory
        # on every construction; presumably this re-adds the chunks each time
        # the process starts -- confirm whether duplicates accumulate on disk.
        self.vectorstore = Chroma.from_documents(
            docs,
            self._model.embeddings,
            persist_directory="./chroma_db",
        )

        self.metadata_field_info = [
            AttributeInfo(
                name="topico",
                description="A materia escolar da qual a questão pertence.",
                type="string",
            ),
            AttributeInfo(
                name="assunto",
                description="O assunto da materia fornecida anteriormente.",
                type="string",
            ),
            AttributeInfo(
                name="dificuldade",
                description="O nivel de dificuldade para resolver a questao.",
                type="string",
            ),
            AttributeInfo(
                name="tipo",
                description="O tipo da questao. Pode ser ou Multipla Escolha ou Justificativa",
                type="string",
            ),
        ]
        document_content_description = "Questões de matérias do ensino médio."

        # NOTE(review): this builds a second store from the same chunks, so the
        # corpus appears to be embedded twice. It is kept separate from
        # self.vectorstore on purpose here, because sharing the persisted store
        # could change what docs_retriever returns -- confirm intent before
        # merging the two.
        db = Chroma.from_documents(docs, self._model.embeddings)

        self.retriever = SelfQueryRetriever.from_llm(
            self._model.llm,
            self.vectorstore,
            document_content_description,
            self.metadata_field_info,
            verbose=True,
        )
        self.docs_retriever = db.as_retriever()