GlitchGhost's picture
Update app.py
d5d6e44 verified
import os
import fitz
import tempfile
import requests
import streamlit as st
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from transformers import pipeline
from huggingface_hub import HfApi, HfFolder, login
# === Embeddings Wrapper ===
class SentenceTransformerEmbeddings(Embeddings):
def __init__(self, model_name="all-MiniLM-L6-v2"):
self.model = SentenceTransformer(model_name)
def embed_documents(self, texts):
return self.model.encode(texts).tolist()
def embed_query(self, text):
return self.model.encode([text])[0].tolist()
# === Utility Functions ===
def extract_text_from_pdf(pdf_path):
doc = fitz.open(pdf_path)
return "\n".join([page.get_text() for page in doc])
def split_text(text, chunk_size=500, overlap=50):
chunks = []
start = 0
while start < len(text):
end = min(start + chunk_size, len(text))
chunks.append(text[start:end])
start += chunk_size - overlap
return chunks
def login_to_huggingface(api_key):
try:
HfFolder.save_token(api_key)
st.success("βœ… Logged into Hugging Face successfully!")
except Exception as e:
st.error(f"❌ Failed to log in: {e}")
def ask_mistral(question, context, hf_api_key):
# Load the Hugging Face Mistral model pipeline
nlp = pipeline("question-answering", model="mistralai/Mistral-7B-v0.3", tokenizer="mistralai/Mistral-7B-v0.3", use_auth_token=hf_api_key)
# Format the input
inputs = {
'context': context,
'question': question
}
# Generate the answer using Mistral
answer = nlp(inputs)
return answer['answer']
def create_vectorstore(chunks):
embeddings = SentenceTransformerEmbeddings()
return FAISS.from_texts(chunks, embedding=embeddings)
def generate_answer(vectorstore, question, hf_api_key):
docs = vectorstore.similarity_search(question, k=3)
context = "\n".join([doc.page_content for doc in docs])
return ask_mistral(question, context, hf_api_key), docs
def extract_website_text(url):
try:
res = requests.get(url, timeout=10)
soup = BeautifulSoup(res.text, "html.parser")
for script in soup(["script", "style"]):
script.decompose()
text = soup.get_text(separator="\n")
return text.strip()
except Exception as e:
return f"Error extracting website: {e}"
# === Streamlit App ===
st.set_page_config(page_title="πŸ“š Multi-Source RAG Assistant", layout="wide")
st.title("πŸ” RAG Assistant: Chat with PDF, CSV, or Website")
# Sidebar
with st.sidebar:
data_source = st.selectbox("πŸ“‚ Select Input Type", ["PDF", "CSV", "Website URL"])
hf_api_key = st.text_input("πŸ”‘ Enter Hugging Face API Key", type="password")
if hf_api_key:
login_to_huggingface(hf_api_key)
# === Logic by Data Source ===
vectorstore = None
full_data_text = ""
if data_source == "PDF":
pdf_file = st.file_uploader("πŸ“„ Upload PDF", type="pdf")
if pdf_file:
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
tmp.write(pdf_file.read())
text = extract_text_from_pdf(tmp.name)
chunks = split_text(text)
vectorstore = create_vectorstore(chunks)
full_data_text = text
st.success("βœ… PDF processed and indexed!")
elif data_source == "CSV":
csv_file = st.file_uploader("πŸ“Š Upload CSV", type="csv")
if csv_file:
df = pd.read_csv(csv_file)
st.subheader("πŸ” Exploratory Data Analysis")
st.dataframe(df)
st.write("πŸ“ˆ Summary Statistics")
st.write(df.describe(include="all").transpose())
csv_text = df.to_string(index=False)
chunks = split_text(csv_text)
vectorstore = create_vectorstore(chunks)
full_data_text = csv_text
st.success("βœ… CSV indexed and ready for Q&A!")
elif data_source == "Website URL":
url = st.text_input("🌐 Enter Website URL")
if url and st.button("πŸ“₯ Extract Website"):
web_text = extract_website_text(url)
if web_text.startswith("Error"):
st.error(web_text)
else:
chunks = split_text(web_text)
vectorstore = create_vectorstore(chunks)
full_data_text = web_text
st.success("βœ… Website text extracted and indexed!")
# === QA Section ===
if vectorstore and hf_api_key:
st.subheader("❓ Ask a Question")
question = st.text_input("πŸ’¬ Your question")
if question:
with st.spinner("πŸ” Thinking..."):
answer, top_docs = generate_answer(vectorstore, question, hf_api_key)
st.success("🧠 Answer")
st.write(answer)
with st.expander("πŸ“Œ Top Relevant Chunks"):
for i, doc in enumerate(top_docs):
st.markdown(f"**Chunk {i+1}:**\n```{doc.page_content}```")
st.download_button("πŸ“€ Download Answer", answer, file_name="rag_answer.txt")
elif not hf_api_key:
st.info("πŸ” Please enter your Hugging Face API key in the sidebar.")