|
import streamlit as st |
|
import os |
|
|
|
from groq import Groq |
|
from PyPDF2 import PdfReader |
|
from datetime import datetime |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
|
from langchain_community.vectorstores import FAISS |
|
from langchain_groq import ChatGroq |
|
|
|
from langchain.chains.question_answering import load_qa_chain |
|
|
|
st.set_page_config('Lectorín')

st.header("Pregunta a tu PDF")

# SECURITY: never ship real API keys as widget defaults or literals — the
# previous hard-coded Groq and LangSmith keys were visible to anyone with
# the source. Pre-fill from the environment and let the user paste their own.
GROQ_API_KEY = st.text_input('Groq API Key', value=os.getenv("GROQ_API_KEY", ""), type='password')

# Clearing the resource cache on upload forces create_embeddings() to rebuild
# the vector store for the new document.
pdf_obj = st.file_uploader("Carga tu documento", type="pdf", on_change=st.cache_resource.clear)

# Label -> (HuggingFace embedding model id, max sequence length in tokens).
modelos = {
    'multi, 512, 0.47G, 384 - intfloat/multilingual-e5-small': ('intfloat/multilingual-e5-small', 512),
    'multi, 256, 0.08G, 384 - multi-qa-MiniLM-L6-cos-v1': ('multi-qa-MiniLM-L6-cos-v1', 256),
    'multi,8192, 2.27G,1024 - BAAI/bge-m3': ('BAAI/bge-m3', 8192),
}

modelo = st.selectbox('Modelo de embedding', list(modelos.keys()))

modelo_embeddings, sequence = modelos[modelo]

# Chunk size in characters, sized to ~5x the model's token window.
chunk_size = sequence * 5


modelos_llm = [
    'llama3-70b-8192',
    'llama3-8b-8192',
    'mixtral-8x7b-32768',
    'gemma-7b-it',
]

modelo_llm = st.selectbox('Modelo de lenguaje', list(modelos_llm))


# LangSmith tracing: only enable when a key is already present in the
# environment — the key itself must not live in source control.
os.environ["LANGCHAIN_TRACING_V2"] = "true" if os.getenv("LANGCHAIN_API_KEY") else "false"

os.environ["LANGCHAIN_PROJECT"] = "qpdf"
|
|
|
|
|
def save_to_file(question=None, answer=None, name=None, path="historial.txt"):
    """Append the current question/answer pair to a plain-text history file.

    Each argument falls back to the module-level globals set by the
    Streamlit flow (``user_question``, ``respuesta``, ``file_name``), so the
    original no-argument call site keeps working unchanged.

    Args:
        question: Question text; defaults to the global ``user_question``.
        answer: Answer text; defaults to the global ``respuesta``.
        name: Source document name; defaults to the global ``file_name``.
        path: History file to append to (created if missing).
    """
    if question is None:
        question = user_question
    if answer is None:
        answer = respuesta
    if name is None:
        name = file_name

    fecha_hora_actual = datetime.now().strftime("%Y-%m-%d %H:%M")
    with open(path, "a", encoding="utf-8") as archivo:
        # Single write per line keeps entries atomic-ish and readable.
        archivo.write("-" * 25 + f" {fecha_hora_actual}  ({name}) " + "-" * 25 + "\n")
        archivo.write(f"Pregunta: {question}\n")
        archivo.write(f"Respuesta: {answer}\n")
|
|
|
|
|
@st.cache_resource
def create_embeddings(pdf):
    """Build a FAISS vector store from the uploaded PDF.

    Extracts the text of every page, splits it into overlapping chunks sized
    by the selected embedding model (module-level ``chunk_size``), embeds the
    chunks with the selected HuggingFace model (``modelo_embeddings``), and
    indexes them in FAISS. Cached by Streamlit so re-runs reuse the store
    until a new file is uploaded.

    Args:
        pdf: An uploaded file-like object accepted by ``PdfReader``.

    Returns:
        A ``FAISS`` vector store over the document's text chunks.
    """
    pdf_reader = PdfReader(pdf)
    # extract_text() may return None for pages with no text layer — guard
    # with `or ""`. Join once instead of quadratic `+=` concatenation.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=150,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)

    embeddings = HuggingFaceEmbeddings(model_name=modelo_embeddings)
    knowledge_base = FAISS.from_texts(chunks, embeddings)

    return knowledge_base
|
|
|
|
|
|
|
def mostrar_logs(logs, hints):
    """Show the retrieved chunks in an expander and the Q&A history in the sidebar.

    Args:
        logs: List of dicts with ``'Pregunta'`` and ``'Respuesta'`` keys.
        hints: Retrieved documents exposing a ``page_content`` attribute.
    """
    separador = "-" * 30

    with st.expander("Chunks"):
        for fragmento in hints:
            st.write(fragmento.page_content)
            st.write(separador)

    barra = st.sidebar
    barra.header("Registro de preguntas")
    for registro in logs:
        pregunta = registro['Pregunta']
        contestacion = registro['Respuesta']
        barra.write(f"**Pregunta: {pregunta}**")
        barra.write(f"Respuesta: {contestacion}")
|
|
|
|
|
|
|
# Persist the Q&A log across Streamlit reruns: a plain `logs = []` would be
# re-created on every interaction, so the sidebar history could never hold
# more than the current entry.
logs = st.session_state.setdefault("logs", [])


if pdf_obj:
    file_name = pdf_obj.name
    knowledge_base = create_embeddings(pdf_obj)
    user_question = st.text_input("¡A jugar! Haz una pregunta sobre tu PDF:")

    if user_question:
        # Expose the key for any downstream code that reads the environment.
        os.environ["GROQ_API_KEY"] = GROQ_API_KEY

        # Retrieve the 5 chunks most similar to the question.
        docs = knowledge_base.similarity_search(user_question, 5)
        # Pass the key directly instead of re-reading the env var just set.
        llm = ChatGroq(groq_api_key=GROQ_API_KEY, model=modelo_llm)

        # "stuff" chain: all retrieved chunks go into a single prompt.
        chain = load_qa_chain(llm, chain_type="stuff")
        respuesta = chain.run(input_documents=docs, question=user_question)

        st.subheader("Respuesta")
        st.write(f":green[{str(respuesta)}]")

        logs.append({"Pregunta": user_question, "Respuesta": respuesta})

        mostrar_logs(logs, docs)

        save_to_file()
|
|