"""Streamlit app: RAG-based question answering over user-uploaded PDF files."""
import os

import streamlit as st
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
# --- Initial configuration and environment variables ---
st.title("Busca en PDFs")

# User-supplied OpenAI API key (masked password field); the app only
# proceeds once a non-empty key has been entered.
OPENAI_API_KEY = st.text_input('OpenAI Api Key', type='password')
if OPENAI_API_KEY:
    # Expose the key via the environment so LangChain components pick it up.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

    def main():
        """Run the RAG pipeline.

        Uploads PDFs into a local folder, loads and chunks them, indexes the
        chunks in a persisted Chroma vector store, and answers user questions
        with an OpenAI chat model grounded in the retrieved chunks.
        """
        st.write("Aplicación en Streamlit ejecutándose correctamente")

        # Working folders: raw PDFs and the persisted vector database.
        source_data_folder = "src"
        path_db = "VectorDB"
        os.makedirs(source_data_folder, exist_ok=True)
        os.makedirs(path_db, exist_ok=True)

        # PDF upload: persist each file to disk so the directory loader can read it.
        uploaded_files = st.file_uploader("Sube archivos PDF", type="pdf", accept_multiple_files=True)
        if uploaded_files:
            for uploaded_file in uploaded_files:
                with open(os.path.join(source_data_folder, uploaded_file.name), "wb") as f:
                    f.write(uploaded_file.getbuffer())
            st.write(f"{len(uploaded_files)} archivos subidos exitosamente")

        # Load and parse every PDF in the source folder.
        loader = PyPDFDirectoryLoader(source_data_folder)
        data_on_pdf = loader.load()
        st.write(f"Se han cargado {len(data_on_pdf)} documentos")

        # Split documents into overlapping chunks for embedding.
        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", ". ", " ", ""],
            chunk_size=1024,
            chunk_overlap=200,
        )
        splits = text_splitter.split_documents(data_on_pdf)
        st.write(f"Se han creado {len(splits)} fragmentos")

        # Embeddings + vector store, persisted to disk between runs.
        embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")
        vectorstore = Chroma.from_documents(
            documents=splits, embedding=embeddings_model, persist_directory=path_db
        )

        # LLM configuration. gpt-3.5-turbo-0125 is a chat-completions model, so it
        # must go through ChatOpenAI — the completion-style `OpenAI` wrapper
        # rejects chat-only model names.
        llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", openai_api_key=OPENAI_API_KEY, temperature=0.8)
        retriever = vectorstore.as_retriever()

        prompt_template = """
Responde en español como un experto en análisis de datos:
{context}
Pregunta: {question}
"""

        def format_docs(docs):
            """Concatenate retrieved document chunks into one context string."""
            return "\n\n".join(doc.page_content for doc in docs)

        def create_prompt(inputs):
            """Fill the prompt template from the chain's single input mapping.

            LCEL passes one dict ({'context': ..., 'question': ...}) through the
            pipe, so this takes a single mapping argument rather than two
            positional parameters.
            """
            return prompt_template.format(context=inputs["context"], question=inputs["question"])

        # A plain dict cannot be piped into a bare function (dicts have no
        # `__ror__` coercion for functions), so create_prompt is wrapped in
        # RunnableLambda explicitly.
        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | RunnableLambda(create_prompt)
            | llm
            | StrOutputParser()
        )

        # User interaction: answer questions grounded in the indexed PDFs.
        pregunta = st.text_input("Haz una pregunta sobre los documentos:")
        if pregunta:
            try:
                response = rag_chain.invoke(pregunta)
                st.markdown(response)
            except Exception as e:
                # Surface the error in the UI instead of crashing the app.
                st.error(f"Error al procesar la pregunta: {e}")

    if __name__ == "__main__":
        main()
else:
    st.write("Por favor, proporciona las API keys para continuar.")