Spaces:

GlitchGhost
/

Multi-Source-RAG-Assistant

Sleeping

App Files Files Community

Multi-Source-RAG-Assistant / app.py

GlitchGhost

Update app.py

d5d6e44 verified 4 months ago

raw

history blame contribute delete

5.24 kB

	import os
	import fitz
	import tempfile
	import requests
	import streamlit as st
	import pandas as pd
	from bs4 import BeautifulSoup
	from sentence_transformers import SentenceTransformer
	from langchain_community.vectorstores import FAISS
	from langchain.embeddings.base import Embeddings
	from transformers import pipeline
	from huggingface_hub import HfApi, HfFolder, login


	# === Embeddings Wrapper ===
	class SentenceTransformerEmbeddings(Embeddings):
	def __init__(self, model_name="all-MiniLM-L6-v2"):
	self.model = SentenceTransformer(model_name)

	def embed_documents(self, texts):
	return self.model.encode(texts).tolist()

	def embed_query(self, text):
	return self.model.encode([text])[0].tolist()

	# === Utility Functions ===
	def extract_text_from_pdf(pdf_path):
	doc = fitz.open(pdf_path)
	return "\n".join([page.get_text() for page in doc])

	def split_text(text, chunk_size=500, overlap=50):
	chunks = []
	start = 0
	while start < len(text):
	end = min(start + chunk_size, len(text))
	chunks.append(text[start:end])
	start += chunk_size - overlap
	return chunks

	def login_to_huggingface(api_key):
	try:
	HfFolder.save_token(api_key)
	st.success("✅ Logged into Hugging Face successfully!")
	except Exception as e:
	st.error(f"❌ Failed to log in: {e}")

	def ask_mistral(question, context, hf_api_key):
	# Load the Hugging Face Mistral model pipeline
	nlp = pipeline("question-answering", model="mistralai/Mistral-7B-v0.3", tokenizer="mistralai/Mistral-7B-v0.3", use_auth_token=hf_api_key)

	# Format the input
	inputs = {
	'context': context,
	'question': question
	}

	# Generate the answer using Mistral
	answer = nlp(inputs)
	return answer['answer']

	def create_vectorstore(chunks):
	embeddings = SentenceTransformerEmbeddings()
	return FAISS.from_texts(chunks, embedding=embeddings)

	def generate_answer(vectorstore, question, hf_api_key):
	docs = vectorstore.similarity_search(question, k=3)
	context = "\n".join([doc.page_content for doc in docs])
	return ask_mistral(question, context, hf_api_key), docs

	def extract_website_text(url):
	try:
	res = requests.get(url, timeout=10)
	soup = BeautifulSoup(res.text, "html.parser")
	for script in soup(["script", "style"]):
	script.decompose()
	text = soup.get_text(separator="\n")
	return text.strip()
	except Exception as e:
	return f"Error extracting website: {e}"

	# === Streamlit App ===
	st.set_page_config(page_title="📚 Multi-Source RAG Assistant", layout="wide")
	st.title("🔍 RAG Assistant: Chat with PDF, CSV, or Website")

	# Sidebar
	with st.sidebar:
	data_source = st.selectbox("📂 Select Input Type", ["PDF", "CSV", "Website URL"])
	hf_api_key = st.text_input("🔑 Enter Hugging Face API Key", type="password")
	if hf_api_key:
	login_to_huggingface(hf_api_key)

	# === Logic by Data Source ===
	vectorstore = None
	full_data_text = ""

	if data_source == "PDF":
	pdf_file = st.file_uploader("📄 Upload PDF", type="pdf")
	if pdf_file:
	with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
	tmp.write(pdf_file.read())
	text = extract_text_from_pdf(tmp.name)
	chunks = split_text(text)
	vectorstore = create_vectorstore(chunks)
	full_data_text = text
	st.success("✅ PDF processed and indexed!")

	elif data_source == "CSV":
	csv_file = st.file_uploader("📊 Upload CSV", type="csv")
	if csv_file:
	df = pd.read_csv(csv_file)
	st.subheader("🔍 Exploratory Data Analysis")
	st.dataframe(df)
	st.write("📈 Summary Statistics")
	st.write(df.describe(include="all").transpose())

	csv_text = df.to_string(index=False)
	chunks = split_text(csv_text)
	vectorstore = create_vectorstore(chunks)
	full_data_text = csv_text
	st.success("✅ CSV indexed and ready for Q&A!")

	elif data_source == "Website URL":
	url = st.text_input("🌐 Enter Website URL")
	if url and st.button("📥 Extract Website"):
	web_text = extract_website_text(url)
	if web_text.startswith("Error"):
	st.error(web_text)
	else:
	chunks = split_text(web_text)
	vectorstore = create_vectorstore(chunks)
	full_data_text = web_text
	st.success("✅ Website text extracted and indexed!")

	# === QA Section ===
	if vectorstore and hf_api_key:
	st.subheader("❓ Ask a Question")
	question = st.text_input("💬 Your question")
	if question:
	with st.spinner("🔍 Thinking..."):
	answer, top_docs = generate_answer(vectorstore, question, hf_api_key)
	st.success("🧠 Answer")
	st.write(answer)

	with st.expander("📌 Top Relevant Chunks"):
	for i, doc in enumerate(top_docs):
	st.markdown(f"Chunk {i+1}:\n```{doc.page_content}```")

	st.download_button("📤 Download Answer", answer, file_name="rag_answer.txt")

	elif not hf_api_key:
	st.info("🔐 Please enter your Hugging Face API key in the sidebar.")