BasicoRag / app.py
JairoCesar's picture
Update app.py
aa62f6c verified
raw
history blame
No virus
4.23 kB
import os
import hashlib
import pickle
import streamlit as st
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import PyPDF2
# Initialize the text-generation client for the Mixtral instruct model
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")


# Initialize Sentence Transformer model.
# Streamlit re-executes this script from top to bottom on every user
# interaction, so the model is created through st.cache_resource: the first
# run loads it and later reruns reuse the same instance instead of loading
# the weights again.
@st.cache_resource
def _load_encoder():
    """Return the sentence-embedding model used for documents and questions."""
    return SentenceTransformer("all-mpnet-base-v2")


encoder = _load_encoder()
# Function to compute directory hash
def compute_directory_hash(directory):
    """Return an MD5 hex digest of the contents of every file under *directory*.

    The digest is used as a change-detection fingerprint for the document
    folder, not for security. Only file contents are hashed (file names are
    not), and files are read in a fixed order so the same tree always yields
    the same digest.

    Args:
        directory: Path of the directory tree to fingerprint.

    Returns:
        The hexadecimal MD5 digest as a string. A missing or empty directory
        yields the digest of empty input.
    """
    hash_md5 = hashlib.md5()
    for root, dirs, files in os.walk(directory):
        # os.walk visits sub-directories in whatever order the OS lists them;
        # sorting `dirs` in place pins the traversal order so an unchanged
        # tree cannot produce a different digest.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            with open(file_path, "rb") as f:
                # Read in small chunks so large files are never held in memory.
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
    return hash_md5.hexdigest()
# Load documents and create embeddings
def load_documents_and_create_embeddings(directory):
    """Extract the text of every PDF under *directory* and embed it.

    Each PDF becomes one document: the extracted text of all its pages
    concatenated in page order.

    Args:
        directory: Root of the directory tree to scan for PDF files.

    Returns:
        A ``(documents, embeddings)`` tuple where ``documents`` is the list of
        extracted texts and ``embeddings`` is the result of
        ``encoder.encode(documents)``.
    """
    documents = []
    for root, _, files in os.walk(directory):
        for file in files:
            # Compare the extension case-insensitively so files named
            # "REPORT.PDF" are indexed as well.
            if not file.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # Pages must be read while the file is still open. A page that
                # yields no text (defensively including None) contributes an
                # empty string instead of breaking the concatenation.
                text = "".join(page.extract_text() or "" for page in reader.pages)
            documents.append(text)
    embeddings = encoder.encode(documents)
    return documents, embeddings
# Load or update cache
def load_or_update_cache(directory):
    """Return documents and embeddings for *directory*, using a pickle cache.

    The cache file ``cache.pkl`` (in the current working directory) stores the
    directory fingerprint together with the documents and embeddings. It is
    reused only when its stored fingerprint equals the current one; otherwise
    the documents are re-read, re-embedded and the cache is rewritten.

    Args:
        directory: Directory tree holding the source documents.

    Returns:
        A ``(documents, embeddings)`` tuple.
    """
    cache_file = "cache.pkl"
    dir_hash = compute_directory_hash(directory)

    cached = None
    if os.path.exists(cache_file):
        try:
            # NOTE(security): pickle.load can execute arbitrary code, so
            # cache.pkl must only ever be a file written by this application.
            with open(cache_file, "rb") as f:
                cached = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError,
                AttributeError, ImportError, IndexError):
            # A truncated or corrupt cache is not fatal: ignore it and
            # rebuild below instead of crashing at start-up.
            cached = None

    # Only trust a cache that has the expected shape and a matching hash.
    if (
        isinstance(cached, dict)
        and cached.get("hash") == dir_hash
        and "documents" in cached
        and "embeddings" in cached
    ):
        return cached["documents"], cached["embeddings"]

    documents, embeddings = load_documents_and_create_embeddings(directory)
    with open(cache_file, "wb") as f:
        pickle.dump({
            "hash": dir_hash,
            "documents": documents,
            "embeddings": embeddings,
        }, f)
    return documents, embeddings
# Function to format the prompt
def format_prompt(message, history):
    """Build a Mixtral-instruct prompt from the chat history and a new message.

    The layout is ``<s>[INST] user [/INST] answer</s> [INST] user [/INST]``:
    every past exchange is rendered as an instruction followed by the model's
    answer, and the new message is appended as the final open instruction.

    Args:
        message: The new user message to be answered.
        history: Iterable of ``(user_prompt, bot_response)`` pairs, oldest
            first.

    Returns:
        The full prompt string.
    """
    eos = "</s>"
    parts = ["<s>"]
    for user_prompt, bot_response in history:
        reply = f"{bot_response}"
        # Each completed assistant turn must be closed with the end-of-sequence
        # marker. It is added only when the stored reply does not already end
        # with one, so a reply that kept its own marker is not doubled.
        closing = "" if reply.rstrip().endswith(eos) else eos
        parts.append(f"[INST] {user_prompt} [/INST]")
        parts.append(f" {reply}{closing} ")
    parts.append(f"[INST] {message} [/INST]")
    return "".join(parts)
# Function to generate response
def generate(prompt, history, temperature=0.3, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
    """Generate the model's reply to *prompt* given the prior conversation.

    Args:
        prompt: The new user message (already including any retrieved context).
        history: Prior ``(user_prompt, bot_response)`` pairs passed to
            ``format_prompt``.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty forwarded to the generation endpoint.

    Returns:
        The generated text, assembled from the streamed tokens.
    """
    # Sampling always uses a fixed seed of 42.
    sampling_options = {
        "temperature": max(float(temperature), 1e-2),
        "max_new_tokens": max_new_tokens,
        "top_p": float(top_p),
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }
    token_stream = client.text_generation(
        format_prompt(prompt, history),
        **sampling_options,
        stream=True,
        details=True,
        return_full_text=False,
    )
    # Concatenate the text of every streamed token into the final answer.
    return "".join(piece.token.text for piece in token_stream)
# Load the document corpus and its embeddings (from cache when still valid)
directory = "./data"
documents, embeddings = load_or_update_cache(directory)

# Streamlit interface
st.title("Preguntale al Buho")

# Chat history: (user message, bot reply) pairs kept in the session state
if 'history' not in st.session_state:
    st.session_state.history = []

# User input
user_input = st.text_input("Tu duda:", key="user_input")

# Generate response and update history
if st.button("Enviar") and user_input:
    if len(documents) == 0:
        # With an empty corpus there is nothing to rank or retrieve, so tell
        # the user instead of failing inside the similarity search.
        st.warning("No hay documentos disponibles para responder.")
    else:
        # Retrieve the single document most similar to the question.
        question_embedding = encoder.encode([user_input])
        similarities = cosine_similarity(question_embedding, embeddings)
        most_similar_idx = np.argmax(similarities)
        retrieved_doc = documents[most_similar_idx]

        # The retrieved text is sent as context, but only the bare question
        # is stored in the history.
        history = st.session_state.history.copy()
        prompt = f"Contexto: {retrieved_doc}\nPregunta: {user_input}"
        bot_response = generate(prompt, history)
        st.session_state.history.append((user_input, bot_response))

# Display conversation
chat_text = "".join(
    f"Tu: {user_msg}\nBuhIA: {bot_msg}\n\n"
    for user_msg, bot_msg in st.session_state.history
)
st.text_area("La respuesta", value=chat_text, height=300, disabled=False)