import streamlit as st import os from langchain.document_loaders import PyPDFDirectoryLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.vectorstores import Chroma from langchain.embeddings import OpenAIEmbeddings from langchain.llms import OpenAI from langchain_core.output_parsers import StrOutputParser from langchain import hub from langchain_core.runnables import RunnablePassthrough from langchain.indexes import VectorstoreIndexCreator # Definir la clase Document class Document: def __init__(self, page_content, metadata=None): self.page_content = page_content self.metadata = metadata if metadata else {} # Configuración inicial y variables de entorno st.title("Busca en PDFs") # Entradas de usuario para las API keys st.write("Obten tu api-key de https://smith.langchain.com") OPENAI_API_KEY = st.text_input('OpenAI Api Key', type='password') SMITH_APIKEY = st.text_input('Smith Api Key', type='password') if OPENAI_API_KEY and SMITH_APIKEY: os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY os.environ["LANGCHAIN_TRACING_V2"] = "true" os.environ["LANGCHAIN_API_KEY"] = SMITH_APIKEY # Función principal def main(): st.write("Aplicación en Streamlit ejecutándose correctamente") # Configura las carpetas necesarias source_data_folder = "MisDatos" path_db = "VectorDB" os.makedirs(source_data_folder, exist_ok=True) os.makedirs(path_db, exist_ok=True) # Carga de archivos PDF uploaded_files = st.file_uploader("Sube archivos PDF", type="pdf", accept_multiple_files=True) if uploaded_files: for uploaded_file in uploaded_files: with open(os.path.join(source_data_folder, uploaded_file.name), "wb") as f: f.write(uploaded_file.getbuffer()) st.write(f"{len(uploaded_files)} archivos subidos exitosamente") # Leer y procesar PDFs loader = PyPDFDirectoryLoader(source_data_folder) data_on_pdf = loader.load() st.write(f"Se han cargado {len(data_on_pdf)} documentos") # Preprocesar texto def preprocess_text(text): cleaned_text = ' '.join(text.split()) return cleaned_text # Particionar datos text_splitter = RecursiveCharacterTextSplitter( separators=["\n\n", "\n", ". ", " ", ""], chunk_size=1024, chunk_overlap=200 ) splits = [] for doc in data_on_pdf: cleaned_text = preprocess_text(doc.page_content) split_docs = text_splitter.split_documents([Document(page_content=cleaned_text, metadata=doc.metadata)]) splits.extend(split_docs) st.write(f"Se han creado {len(splits)} fragmentos") # Embeddings embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002") # Crear y persistir la base de datos vectorial vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings_model, persist_directory=path_db) # Crear el índice usando VectorstoreIndexCreator index_creator = VectorstoreIndexCreator(embedding=embeddings_model) index = vectorstore.as_retriever() st.write(f"El tipo de índice es: {type(index)}") # Configuración del LLM llm = OpenAI(model_name="gpt-3.5-turbo-0125", temperature=0.8) # Configuración del retriever retriever = vectorstore.as_retriever() # Cargar el prompt desde LangChain Hub prompt = hub.pull("rlm/rag-prompt") def format_docs(docs): return "\n\n".join(doc.page_content for doc in docs) # Definición del pipeline de procesamiento rag_chain = ( {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser() ) # Interacción con el usuario pregunta = st.text_area("Haz una pregunta sobre los documentos:") if pregunta: try: response = rag_chain.invoke(pregunta) st.markdown(response) except Exception as e: st.error(f"Error al procesar la pregunta: {e}") if __name__ == "__main__": main() else: st.write("Por favor, proporciona las API keys para continuar.")