import tempfile

import fitz
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from huggingface_hub import login
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
from sentence_transformers import SentenceTransformer
from transformers import pipeline


# === Embeddings Wrapper ===
class SentenceTransformerEmbeddings(Embeddings):
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        return self.model.encode([text])[0].tolist()


# === Utility Functions ===
def extract_text_from_pdf(pdf_path):
    # Open with a context manager so the file handle is released afterwards.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)


def split_text(text, chunk_size=500, overlap=50):
    # Fixed-size character chunks with a small overlap between neighbours.
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks


def login_to_huggingface(api_key):
    try:
        login(token=api_key)
        st.success("✅ Logged into Hugging Face successfully!")
    except Exception as e:
        st.error(f"❌ Failed to log in: {e}")


@st.cache_resource
def load_mistral_pipeline(hf_api_key):
    # Mistral-7B is a causal language model, so it is loaded with the
    # text-generation pipeline (the extractive "question-answering" pipeline
    # expects a model with a span-prediction head). Cached so the model is
    # only loaded once per session.
    return pipeline(
        "text-generation",
        model="mistralai/Mistral-7B-v0.3",
        tokenizer="mistralai/Mistral-7B-v0.3",
        token=hf_api_key,
    )


def ask_mistral(question, context, hf_api_key):
    generator = load_mistral_pipeline(hf_api_key)
    # Put the retrieved context and the question into a single prompt.
    prompt = (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\nAnswer:"
    )
    # Generate the answer using Mistral
    output = generator(prompt, max_new_tokens=256, return_full_text=False)
    return output[0]["generated_text"].strip()


def create_vectorstore(chunks):
    embeddings = SentenceTransformerEmbeddings()
    return FAISS.from_texts(chunks, embedding=embeddings)


def generate_answer(vectorstore, question, hf_api_key):
    # Retrieve the three most similar chunks and use them as context.
    docs = vectorstore.similarity_search(question, k=3)
    context = "\n".join(doc.page_content for doc in docs)
    return ask_mistral(question, context, hf_api_key), docs


def extract_website_text(url):
    try:
        res = requests.get(url, timeout=10)
        soup = BeautifulSoup(res.text, "html.parser")
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator="\n")
        return text.strip()
    except Exception as e:
        return f"Error extracting website: {e}"


# === Streamlit App ===
st.set_page_config(page_title="📚 Multi-Source RAG Assistant", layout="wide")
st.title("🔍 RAG Assistant: Chat with PDF, CSV, or Website")

# Sidebar
with st.sidebar:
    data_source = st.selectbox("📂 Select Input Type", ["PDF", "CSV", "Website URL"])
    hf_api_key = st.text_input("🔑 Enter Hugging Face API Key", type="password")
    if hf_api_key:
        login_to_huggingface(hf_api_key)

# === Logic by Data Source ===
vectorstore = None
full_data_text = ""

if data_source == "PDF":
    pdf_file = st.file_uploader("📄 Upload PDF", type="pdf")
    if pdf_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(pdf_file.read())
        # Read the PDF after the temp file is closed so PyMuPDF can open it.
        text = extract_text_from_pdf(tmp.name)
        chunks = split_text(text)
        vectorstore = create_vectorstore(chunks)
        full_data_text = text
        st.success("✅ PDF processed and indexed!")

elif data_source == "CSV":
    csv_file = st.file_uploader("📊 Upload CSV", type="csv")
    if csv_file:
        df = pd.read_csv(csv_file)
        st.subheader("🔍 Exploratory Data Analysis")
        st.dataframe(df)
        st.write("📈 Summary Statistics")
        st.write(df.describe(include="all").transpose())
        csv_text = df.to_string(index=False)
        chunks = split_text(csv_text)
        vectorstore = create_vectorstore(chunks)
        full_data_text = csv_text
        st.success("✅ CSV indexed and ready for Q&A!")
== "Website URL": url = st.text_input("🌐 Enter Website URL") if url and st.button("📥 Extract Website"): web_text = extract_website_text(url) if web_text.startswith("Error"): st.error(web_text) else: chunks = split_text(web_text) vectorstore = create_vectorstore(chunks) full_data_text = web_text st.success("✅ Website text extracted and indexed!") # === QA Section === if vectorstore and hf_api_key: st.subheader("❓ Ask a Question") question = st.text_input("💬 Your question") if question: with st.spinner("🔍 Thinking..."): answer, top_docs = generate_answer(vectorstore, question, hf_api_key) st.success("🧠 Answer") st.write(answer) with st.expander("📌 Top Relevant Chunks"): for i, doc in enumerate(top_docs): st.markdown(f"**Chunk {i+1}:**\n```{doc.page_content}```") st.download_button("📤 Download Answer", answer, file_name="rag_answer.txt") elif not hf_api_key: st.info("🔐 Please enter your Hugging Face API key in the sidebar.")