import tempfile

import fitz  # PyMuPDF
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from huggingface_hub import login
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from transformers import pipeline
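# Streamlit RAG assistant: index a PDF, a CSV, or a website into a FAISS
# vector store using sentence-transformers embeddings, then answer questions
# about the indexed text with a Hugging Face model.
# Dependencies (PyPI): streamlit, pymupdf, requests, beautifulsoup4, pandas,
# sentence-transformers, langchain, langchain-community, faiss-cpu,
# transformers, huggingface-hub.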
class SentenceTransformerEmbeddings(Embeddings):
    """Adapter exposing a SentenceTransformer model through the LangChain Embeddings interface."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        # Encode a batch of text chunks for indexing.
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        # Encode a single query string for similarity search.
        return self.model.encode([text])[0].tolist()
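# FAISS.from_texts and similarity_search call embed_documents / embed_query on
# this adapter, so another sentence-transformers model can be swapped in via
# the model_name argument.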
def extract_text_from_pdf(pdf_path):
    # Read every page with PyMuPDF and join the extracted text.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)


def split_text(text, chunk_size=500, overlap=50):
    # Slide a fixed-size window over the text, overlapping consecutive chunks
    # so sentences cut at a boundary still appear intact in one chunk.
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks
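# With the defaults (chunk_size=500, overlap=50) the chunks start at offsets
# 0, 450, 900, ..., so the last 50 characters of each chunk are repeated at
# the start of the next one.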
def login_to_huggingface(api_key):
    # Validate and cache the token so downstream Hugging Face calls can use it.
    try:
        login(token=api_key)
        st.success("Logged into Hugging Face successfully!")
    except Exception as e:
        st.error(f"Failed to log in: {e}")


def ask_mistral(question, context, hf_api_key):
    # Build an extractive question-answering pipeline over the retrieved context.
    nlp = pipeline(
        "question-answering",
        model="mistralai/Mistral-7B-v0.3",
        tokenizer="mistralai/Mistral-7B-v0.3",
        token=hf_api_key,
    )
    answer = nlp({"context": context, "question": question})
    return answer["answer"]
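# Note: mistralai/Mistral-7B-v0.3 is a base causal language model without a
# trained extractive-QA head, so the "question-answering" pipeline initialises
# that head from scratch (transformers warns about newly initialised weights).
# A "text-generation" pipeline prompted with the retrieved context is a common
# alternative if answers look poor.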
def create_vectorstore(chunks):
    # Embed every chunk and index the vectors in an in-memory FAISS store.
    embeddings = SentenceTransformerEmbeddings()
    return FAISS.from_texts(chunks, embedding=embeddings)


def generate_answer(vectorstore, question, hf_api_key):
    # Retrieve the three most similar chunks and use them as context for the model.
    docs = vectorstore.similarity_search(question, k=3)
    context = "\n".join(doc.page_content for doc in docs)
    return ask_mistral(question, context, hf_api_key), docs
def extract_website_text(url):
    # Fetch the page, drop <script>/<style> blocks, and return the visible text.
    try:
        res = requests.get(url, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator="\n")
        return text.strip()
    except Exception as e:
        return f"Error extracting website: {e}"
st.set_page_config(page_title="Multi-Source RAG Assistant", layout="wide")
st.title("RAG Assistant: Chat with PDF, CSV, or Website")
with st.sidebar:
    data_source = st.selectbox("Select Input Type", ["PDF", "CSV", "Website URL"])
    hf_api_key = st.text_input("Enter Hugging Face API Key", type="password")
    if hf_api_key:
        login_to_huggingface(hf_api_key)
vectorstore = None
full_data_text = ""
if data_source == "PDF": |
|
pdf_file = st.file_uploader("π Upload PDF", type="pdf") |
|
if pdf_file: |
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp: |
|
tmp.write(pdf_file.read()) |
|
text = extract_text_from_pdf(tmp.name) |
|
chunks = split_text(text) |
|
vectorstore = create_vectorstore(chunks) |
|
full_data_text = text |
|
st.success("β
PDF processed and indexed!") |
|
|
|
elif data_source == "CSV": |
|
csv_file = st.file_uploader("π Upload CSV", type="csv") |
|
if csv_file: |
|
df = pd.read_csv(csv_file) |
|
st.subheader("π Exploratory Data Analysis") |
|
st.dataframe(df) |
|
st.write("π Summary Statistics") |
|
st.write(df.describe(include="all").transpose()) |
|
|
|
csv_text = df.to_string(index=False) |
|
chunks = split_text(csv_text) |
|
vectorstore = create_vectorstore(chunks) |
|
full_data_text = csv_text |
|
st.success("β
CSV indexed and ready for Q&A!") |
|
|
|
elif data_source == "Website URL": |
|
url = st.text_input("π Enter Website URL") |
|
if url and st.button("π₯ Extract Website"): |
|
web_text = extract_website_text(url) |
|
if web_text.startswith("Error"): |
|
st.error(web_text) |
|
else: |
|
chunks = split_text(web_text) |
|
vectorstore = create_vectorstore(chunks) |
|
full_data_text = web_text |
|
st.success("β
Website text extracted and indexed!") |
|
|
|
|
|
if vectorstore and hf_api_key:
    st.subheader("Ask a Question")
    question = st.text_input("Your question")
    if question:
        with st.spinner("Thinking..."):
            answer, top_docs = generate_answer(vectorstore, question, hf_api_key)
        st.success("Answer")
        st.write(answer)

        with st.expander("Top Relevant Chunks"):
            for i, doc in enumerate(top_docs):
                st.markdown(f"**Chunk {i+1}:**\n```{doc.page_content}```")

        st.download_button("Download Answer", answer, file_name="rag_answer.txt")
elif not hf_api_key:
    st.info("Please enter your Hugging Face API key in the sidebar.")