BasicoRag

Running

App Files Files Community

BasicoRag / app.py

JairoCesar

Update app.py

aa62f6c verified about 9 hours ago

raw

history blame

No virus

4.23 kB

	import os
	import hashlib
	import pickle
	import streamlit as st
	from huggingface_hub import InferenceClient
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np
	import PyPDF2

	# Streamlit re-executes this script from top to bottom on every user
	# interaction. st.cache_resource (Streamlit >= 1.18) keeps a single shared
	# instance of each heavy object alive across those reruns instead of
	# rebuilding it every time.
	@st.cache_resource
	def _get_client():
	    """Create the inference client bound to the Mixtral instruct model."""
	    return InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")


	@st.cache_resource
	def _get_encoder():
	    """Load the sentence-embedding model used for documents and questions."""
	    return SentenceTransformer("all-mpnet-base-v2")


	# Initialize the client
	client = _get_client()

	# Initialize Sentence Transformer model
	encoder = _get_encoder()

	# Function to compute directory hash
	def compute_directory_hash(directory):
	    """Return an MD5 hex digest that fingerprints every file under *directory*.

	    For each file, visited in sorted order, the digest absorbs the file's
	    path relative to *directory* followed by the MD5 of its contents. Hashing
	    the path and a fixed-length per-file digest (rather than the raw
	    concatenated bytes) means that renaming a file, or moving bytes from the
	    end of one file to the start of the next, changes the result.

	    Args:
	        directory: Root folder to walk recursively.

	    Returns:
	        str: 32-character hexadecimal digest.
	    """
	    hash_md5 = hashlib.md5()
	    for root, dirs, files in os.walk(directory):
	        # Sorting in place makes os.walk descend into sub-directories in a
	        # stable order, so the digest does not depend on filesystem ordering.
	        dirs.sort()
	        for file in sorted(files):
	            file_path = os.path.join(root, file)

	            file_md5 = hashlib.md5()
	            with open(file_path, "rb") as f:
	                for chunk in iter(lambda: f.read(4096), b""):
	                    file_md5.update(chunk)

	            # Use "/" separators so the same tree hashes identically on
	            # every platform; NUL cannot occur in a path, so it cleanly
	            # separates the path from the fixed-length content digest.
	            rel_path = os.path.relpath(file_path, directory).replace(os.sep, "/")
	            hash_md5.update(rel_path.encode("utf-8", "surrogateescape"))
	            hash_md5.update(b"\0")
	            hash_md5.update(file_md5.digest())
	    return hash_md5.hexdigest()

	# Load documents and create embeddings
	def load_documents_and_create_embeddings(directory):
	    """Extract the text of every PDF under *directory* and embed it.

	    Files are matched by a case-insensitive ``.pdf`` extension and visited in
	    sorted order so the resulting document list is reproducible.

	    Args:
	        directory: Root folder searched recursively for PDF files.

	    Returns:
	        tuple: ``(documents, embeddings)``. ``documents`` is a list holding
	        one string per PDF (the text of all its pages concatenated);
	        ``embeddings`` is whatever ``encoder.encode(documents)`` returns for
	        that list, in the same order.
	    """
	    documents = []
	    for root, dirs, files in os.walk(directory):
	        dirs.sort()  # stable descent order
	        for file in sorted(files):
	            if not file.lower().endswith(".pdf"):
	                continue
	            file_path = os.path.join(root, file)
	            # Extract inside the `with` so the reader's underlying file is
	            # still open while pages are being read.
	            with open(file_path, "rb") as f:
	                reader = PyPDF2.PdfReader(f)
	                # `or ""` lets a page with no extractable text contribute
	                # nothing instead of breaking the concatenation.
	                text = "".join(page.extract_text() or "" for page in reader.pages)
	            documents.append(text)

	    embeddings = encoder.encode(documents)
	    return documents, embeddings

	# Load or update cache
	def load_or_update_cache(directory):
	    """Return documents and embeddings for *directory*, reusing ``cache.pkl`` when valid.

	    The cache file (in the current working directory) stores the directory
	    hash it was built from. It is reused only when that hash equals the
	    current one; otherwise, or when the file cannot be read back as the
	    expected dictionary, the documents are re-processed and the cache is
	    rewritten.

	    Args:
	        directory: Folder containing the source documents.

	    Returns:
	        tuple: ``(documents, embeddings)`` as produced by
	        ``load_documents_and_create_embeddings``.
	    """
	    cache_file = "cache.pkl"
	    dir_hash = compute_directory_hash(directory)

	    if os.path.exists(cache_file):
	        try:
	            # SECURITY: unpickling can execute arbitrary code. cache.pkl must
	            # only ever be a file this application wrote itself.
	            with open(cache_file, "rb") as f:
	                cache = pickle.load(f)
	            if cache["hash"] == dir_hash:
	                return cache["documents"], cache["embeddings"]
	        except (
	            pickle.UnpicklingError,
	            EOFError,
	            AttributeError,
	            ImportError,
	            IndexError,
	            KeyError,
	            TypeError,
	        ):
	            # A truncated, corrupt or differently-shaped cache is not fatal:
	            # fall through and rebuild it from the documents.
	            pass

	    documents, embeddings = load_documents_and_create_embeddings(directory)
	    with open(cache_file, "wb") as f:
	        pickle.dump({
	            "hash": dir_hash,
	            "documents": documents,
	            "embeddings": embeddings,
	        }, f)

	    return documents, embeddings

	# Function to format the prompt
	def format_prompt(message, history):
	    """Render past turns plus a new message as one ``[INST] ... [/INST]`` prompt.

	    Args:
	        message: The new user message to place last.
	        history: Iterable of ``(user_prompt, bot_response)`` pairs, oldest first.

	    Returns:
	        str: ``"<s>"`` followed by one ``[INST] user [/INST] answer `` segment
	        per history pair and a final ``[INST] message [/INST]`` segment.
	    """
	    segments = ["<s>"]
	    segments.extend(
	        f"[INST] {past_question} [/INST] {past_answer} "
	        for past_question, past_answer in history
	    )
	    segments.append(f"[INST] {message} [/INST]")
	    return "".join(segments)

	# Function to generate response
	def generate(prompt, history, temperature=0.3, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
	    """Ask the module-level inference client for a completion and return it as text.

	    Args:
	        prompt: The new user message (context plus question).
	        history: Previous ``(user, bot)`` turns passed on to ``format_prompt``.
	        temperature: Sampling temperature; values below 0.01 are raised to 0.01.
	        max_new_tokens: Forwarded unchanged to the client.
	        top_p: Nucleus-sampling threshold, coerced to ``float``.
	        repetition_penalty: Forwarded unchanged to the client.

	    Returns:
	        str: The ``token.text`` of every streamed item, concatenated in order.
	    """
	    sampling_options = {
	        "temperature": max(float(temperature), 1e-2),
	        "max_new_tokens": max_new_tokens,
	        "top_p": float(top_p),
	        "repetition_penalty": repetition_penalty,
	        "do_sample": True,
	        "seed": 42,  # fixed seed passed with every request
	    }

	    token_stream = client.text_generation(
	        format_prompt(prompt, history),
	        **sampling_options,
	        stream=True,
	        details=True,
	        return_full_text=False,
	    )
	    return "".join(event.token.text for event in token_stream)

	# Load documents and create embeddings (served from cache.pkl when the
	# contents of ./data have not changed).
	directory = "./data"
	documents, embeddings = load_or_update_cache(directory)

	# Streamlit interface
	st.title("Preguntale al Buho")

	# Chat history: a list of (question, answer) tuples kept in the session
	# state so it survives Streamlit's script reruns.
	if 'history' not in st.session_state:
	    st.session_state.history = []

	# User input
	user_input = st.text_input("Tu duda:", key="user_input")

	# Generate response and update history
	if st.button("Enviar") and user_input:
	    if len(documents) == 0:
	        # With no documents there is nothing to rank; the similarity lookup
	        # below would otherwise fail on the empty embedding set.
	        st.warning(f"No se encontraron documentos PDF en {directory}.")
	    else:
	        # Retrieve the single document whose embedding is closest to the
	        # question and hand it to the model as context.
	        question_embedding = encoder.encode([user_input])
	        similarities = cosine_similarity(question_embedding, embeddings)
	        most_similar_idx = int(np.argmax(similarities))
	        retrieved_doc = documents[most_similar_idx]
	        history = st.session_state.history.copy()
	        prompt = f"Contexto: {retrieved_doc}\nPregunta: {user_input}"
	        bot_response = generate(prompt, history)
	        # Only the bare question is stored, not the context-augmented prompt.
	        st.session_state.history.append((user_input, bot_response))

	# Display conversation
	chat_text = "".join(
	    f"Tu: {user_msg}\nBuhIA: {bot_msg}\n\n"
	    for user_msg, bot_msg in st.session_state.history
	)
	st.text_area("La respuesta", value=chat_text, height=300, disabled=False)