File size: 4,596 Bytes
6aa138d
 
d9ffb26
6aa138d
 
 
 
 
7b166af
ef508a8
04c7ee6
6aa138d
d9ffb26
 
bd15a6d
d9ffb26
bd15a6d
d9ffb26
6aa138d
 
 
 
66bf2f3
6aa138d
7b166af
 
 
6aa138d
bd3af2e
7b166af
6aa138d
 
 
 
 
 
04c7ee6
6aa138d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6fa75b8
 
 
 
 
6aa138d
 
 
 
 
 
6fa75b8
 
 
bd15a6d
6fa75b8
6aa138d
 
 
 
 
04c7ee6
d56526e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import streamlit as st
import os
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain.indexes import VectorstoreIndexCreator

class Document:
    """Lightweight document container: page text plus a metadata dict.

    Mirrors the minimal interface LangChain splitters expect
    (`page_content` / `metadata`).
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # `is None` (not truthiness) so a caller-supplied empty dict is
        # kept as-is instead of being silently replaced; `None` gets a
        # fresh dict per instance (no shared mutable default).
        self.metadata = metadata if metadata is not None else {}

# Initial configuration: page title and API-key inputs.
st.title("Busca en PDFs")

# User inputs for the API keys (LangSmith key from smith.langchain.com).
st.write("Obten tu api-key de https://smith.langchain.com")
OPENAI_API_KEY = st.text_input('OpenAI Api Key', type='password')
SMITH_APIKEY = st.text_input('Smith Api Key', type='password')

if OPENAI_API_KEY and SMITH_APIKEY:
    # Export the keys so LangChain / LangSmith tracing pick them up.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = SMITH_APIKEY

    def main():
        """Run the PDF question-answering workflow.

        Saves uploaded PDFs to disk, splits them into overlapping chunks,
        indexes the chunks in a persistent Chroma vector store, and answers
        free-text questions through a retrieval-augmented (RAG) chain.
        """
        st.write("Aplicación en Streamlit ejecutándose correctamente")

        # Working folders: uploaded PDFs and the persisted vector DB.
        source_data_folder = "MisDatos"
        path_db = "VectorDB"
        os.makedirs(source_data_folder, exist_ok=True)
        os.makedirs(path_db, exist_ok=True)

        # PDF upload: each file is written to disk so the directory
        # loader below can pick it up.
        uploaded_files = st.file_uploader("Sube archivos PDF", type="pdf", accept_multiple_files=True)
        if uploaded_files:
            for uploaded_file in uploaded_files:
                with open(os.path.join(source_data_folder, uploaded_file.name), "wb") as f:
                    f.write(uploaded_file.getbuffer())
            st.write(f"{len(uploaded_files)} archivos subidos exitosamente")

            # Load every PDF in the folder (one document per page).
            loader = PyPDFDirectoryLoader(source_data_folder)
            data_on_pdf = loader.load()
            st.write(f"Se han cargado {len(data_on_pdf)} documentos")

            def preprocess_text(text):
                """Collapse every run of whitespace into a single space."""
                return ' '.join(text.split())

            # Split the cleaned text into 1024-char chunks with 200-char
            # overlap, preferring paragraph/sentence boundaries.
            text_splitter = RecursiveCharacterTextSplitter(
                separators=["\n\n", "\n", ". ", " ", ""],
                chunk_size=1024,
                chunk_overlap=200
            )
            splits = []
            for doc in data_on_pdf:
                cleaned_text = preprocess_text(doc.page_content)
                split_docs = text_splitter.split_documents([Document(page_content=cleaned_text, metadata=doc.metadata)])
                splits.extend(split_docs)
            st.write(f"Se han creado {len(splits)} fragmentos")

            # Embed the chunks and persist them in a Chroma vector store.
            embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")
            vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings_model, persist_directory=path_db)

            # Single retriever over the store. (The original built two
            # identical retrievers plus an unused VectorstoreIndexCreator;
            # both redundancies are removed here.)
            retriever = vectorstore.as_retriever()
            st.write(f"El tipo de índice es: {type(retriever)}")

            # LLM configuration.
            # NOTE(review): "gpt-3.5-turbo-0125" is a chat model; the
            # completions-style `OpenAI` wrapper may reject it — consider
            # `ChatOpenAI` instead. Left unchanged to preserve behavior.
            llm = OpenAI(model_name="gpt-3.5-turbo-0125", temperature=0.8)

            # RAG prompt pulled from LangChain Hub.
            prompt = hub.pull("rlm/rag-prompt")

            def format_docs(docs):
                """Join retrieved documents into one context string."""
                return "\n\n".join(doc.page_content for doc in docs)

            # Processing pipeline: retrieve -> format -> prompt -> LLM -> text.
            rag_chain = (
                {"context": retriever | format_docs, "question": RunnablePassthrough()}
                | prompt
                | llm
                | StrOutputParser()
            )

            # User interaction: answer questions about the indexed PDFs.
            pregunta = st.text_area("Haz una pregunta sobre los documentos:")
            if pregunta:
                try:
                    response = rag_chain.invoke(pregunta)
                    st.markdown(response)
                except Exception as e:
                    st.error(f"Error al procesar la pregunta: {e}")

    if __name__ == "__main__":
        main()
else:
    st.write("Por favor, proporciona las API keys para continuar.")