import streamlit as st
import os
from llama_index.core import SimpleDirectoryReader, Document, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import TokenTextSplitter
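
# Assumed dependencies for the imports above (package names are an assumption,
# matching the usual LlamaIndex integration split):
#   pip install streamlit llama-index llama-index-embeddings-openai llama-index-llms-openai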

# Initial configuration and environment variables
st.title("Search in PDFs")

# User input for the API key
st.write("Get your OpenAI API key")
OPENAI_API_KEY = st.text_input('OpenAI Api Key', type='password')

if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    # Main function
    def main():
        st.write("Streamlit application running correctly")

        # Set up the required folders
        source_data_folder = "MisDatos"
        path_db = "VectorDB"
        os.makedirs(source_data_folder, exist_ok=True)
        os.makedirs(path_db, exist_ok=True)

        # PDF upload
        uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
        if uploaded_files:
            for uploaded_file in uploaded_files:
                # Save each uploaded file into the source data folder
                with open(os.path.join(source_data_folder, uploaded_file.name), "wb") as f:
                    f.write(uploaded_file.getbuffer())
            st.write(f"{len(uploaded_files)} files uploaded successfully")
            # Read and process the PDFs
            loader = SimpleDirectoryReader(source_data_folder)
            data_on_pdf = loader.load_data()
            st.write(f"{len(data_on_pdf)} documents loaded")

            # Preprocess text
            def preprocess_text(text):
                # Collapse runs of whitespace into single spaces
                cleaned_text = ' '.join(text.split())
                return cleaned_text
            # Split the data into chunks
            text_splitter = TokenTextSplitter(
                chunk_size=1024,
                chunk_overlap=200
            )
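            # The 200-token overlap repeats the tail of each 1024-token chunk at
            # the start of the next, so a sentence cut at a chunk boundary still
            # appears whole in at least one chunk.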
            splits = []
            for doc in data_on_pdf:
                cleaned_text = preprocess_text(doc.text)
                split_docs = text_splitter.split_text(cleaned_text)
                split_docs = [Document(text=chunk, metadata=doc.metadata) for chunk in split_docs]
                splits.extend(split_docs)
            st.write(f"{len(splits)} chunks created")
            # Embeddings
            embeddings_model = OpenAIEmbedding(model="text-embedding-ada-002")

            # Build the vector index over the chunked documents
            index = VectorStoreIndex.from_documents(splits, embed_model=embeddings_model)
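            # path_db is created above but not used yet; a minimal sketch for
            # persisting the index to disk with the default storage backend:
            #   index.storage_context.persist(persist_dir=path_db)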
            # Create the query engine
            query_engine = index.as_query_engine()

            # LLM configuration
            llm = OpenAI(api_key=OPENAI_API_KEY, model="gpt-3.5-turbo-0125", temperature=0.8)
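            # Note: as_query_engine() synthesizes its answers with the library's
            # global default LLM; the llm configured here is used only for the
            # manual prompt in rag_pipeline below.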
            # RAG pipeline definition
            def rag_pipeline(question):
                response = query_engine.query(question)
                # Collect the text of the source nodes retrieved for this query
                context = "\n\n".join(node.node.get_content() for node in response.source_nodes)
                prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
                llm_response = llm.complete(prompt)
                return llm_response.text
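            # response.response already holds the query engine's synthesized
            # answer; re-prompting the LLM over the raw retrieved chunks, as
            # above, trades that convenience for direct control over the prompt.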
            # User interaction
            pregunta = st.text_area("Ask a question about the documents:")
            if pregunta:
                try:
                    response = rag_pipeline(pregunta)
                    st.markdown(response)
                except Exception as e:
                    st.error(f"Error processing the question: {e}")

    main()
else:
    st.write("Please provide the API key to continue.")