# openai-RAG / app.py
# NOTE: the original paste included Hugging Face Spaces file-viewer chrome
# (author: arman77mxx, commit 6aa138d verified, "raw / history / blame",
# size 3.57 kB). It has been converted to this comment so the file parses.
import streamlit as st
import os
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# Initial configuration: page title and API-key entry.
st.title("Busca en PDFs")
# Masked text input for the OpenAI API key; the rest of the app only runs once it is provided.
OPENAI_API_KEY = st.text_input('OpenAI Api Key', type='password')
if OPENAI_API_KEY:
    # Expose the key through the environment so langchain's OpenAI clients pick it up.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

    def main():
        """Run the RAG app: upload PDFs, index them in Chroma, answer questions.

        Side effects: creates ``src`` (uploaded PDFs) and ``VectorDB`` (persisted
        Chroma store) directories, writes uploaded files to disk, and renders
        Streamlit widgets. Re-executed on every Streamlit rerun.
        """
        st.write("Aplicación en Streamlit ejecutándose correctamente")

        # Working folders: uploaded PDFs and the persisted vector store.
        source_data_folder = "src"
        path_db = "VectorDB"
        os.makedirs(source_data_folder, exist_ok=True)
        os.makedirs(path_db, exist_ok=True)

        # --- PDF upload ---------------------------------------------------
        uploaded_files = st.file_uploader(
            "Sube archivos PDF", type="pdf", accept_multiple_files=True
        )
        if uploaded_files:
            for uploaded_file in uploaded_files:
                with open(os.path.join(source_data_folder, uploaded_file.name), "wb") as f:
                    f.write(uploaded_file.getbuffer())
            st.write(f"{len(uploaded_files)} archivos subidos exitosamente")

            # --- Load and split the PDFs into overlapping chunks ----------
            loader = PyPDFDirectoryLoader(source_data_folder)
            data_on_pdf = loader.load()
            st.write(f"Se han cargado {len(data_on_pdf)} documentos")

            text_splitter = RecursiveCharacterTextSplitter(
                separators=["\n\n", "\n", ". ", " ", ""],
                chunk_size=1024,
                chunk_overlap=200,
            )
            splits = text_splitter.split_documents(data_on_pdf)
            st.write(f"Se han creado {len(splits)} fragmentos")

            # --- Embed chunks and build/persist the vector store ----------
            embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")
            vectorstore = Chroma.from_documents(
                documents=splits, embedding=embeddings_model, persist_directory=path_db
            )

            # NOTE(review): "gpt-3.5-turbo-0125" is a chat model, but the
            # completions-style `OpenAI` wrapper is used here — the API may
            # reject it. Consider `ChatOpenAI`; left unchanged to avoid adding
            # an import beyond the bug fix below.
            llm = OpenAI(
                model_name="gpt-3.5-turbo-0125",
                openai_api_key=OPENAI_API_KEY,
                temperature=0.8,
            )
            retriever = vectorstore.as_retriever()

            prompt_template = """
Responde en español como un experto en análisis de datos:
{context}
Pregunta: {question}
"""

            def format_docs(docs):
                """Join retrieved document chunks into a single context string."""
                return "\n\n".join(doc.page_content for doc in docs)

            def create_prompt(inputs):
                """Fill the prompt template from the chain's input mapping.

                BUG FIX: the original signature was (context, question), but a
                plain function piped into an LCEL chain receives ONE argument —
                the {"context": ..., "question": ...} dict built upstream — so
                every query raised TypeError. Accept the dict and unpack it.
                """
                return prompt_template.format(
                    context=inputs["context"], question=inputs["question"]
                )

            # retriever output is formatted into text; the raw question passes
            # through untouched; the filled prompt feeds the LLM.
            rag_chain = (
                {"context": retriever | format_docs, "question": RunnablePassthrough()}
                | create_prompt
                | llm
                | StrOutputParser()
            )

            # --- Question answering ---------------------------------------
            pregunta = st.text_input("Haz una pregunta sobre los documentos:")
            if pregunta:
                try:
                    response = rag_chain.invoke(pregunta)
                    st.markdown(response)
                except Exception as e:
                    # Surface failures (API, parsing) in the UI instead of crashing.
                    st.error(f"Error al procesar la pregunta: {e}")

    if __name__ == "__main__":
        main()
else:
    st.write("Por favor, proporciona las API keys para continuar.")