Seif-aber committed
Commit · edac567
1 Parent(s): 46d0d8e
implemented pdf chat assistant with gemini and RAG
Files changed:
- Dockerfile +0 -20
- README.md +60 -13
- config/__init__.py +0 -0
- config/settings.py +58 -0
- requirements.txt +7 -3
- src/app.py +167 -0
- src/components/__init__.py +1 -0
- src/components/chat_interface.py +120 -0
- src/components/file_uploader.py +27 -0
- src/components/pdf_viewer.py +57 -0
- src/models/__init__.py +0 -0
- src/models/chat_models.py +21 -0
- src/services/__init__.py +0 -0
- src/services/embedding_service.py +140 -0
- src/services/gemini_client.py +77 -0
- src/services/pdf_processor.py +94 -0
- src/services/rag_service.py +105 -0
- src/streamlit_app.py +0 -40
- src/utils/__init__.py +0 -0
- src/utils/chunking.py +37 -0
- src/utils/vector_store.py +101 -0
Dockerfile
DELETED
@@ -1,20 +0,0 @@
-FROM python:3.13.5-slim
-
-WORKDIR /app
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-
-COPY requirements.txt ./
-COPY src/ ./src/
-
-RUN pip3 install -r requirements.txt
-
-EXPOSE 8501
-
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md
CHANGED
|
@@ -1,19 +1,66 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
tags:
|
| 9 |
-
- streamlit
|
| 10 |
pinned: false
|
| 11 |
-
short_description: A Streamlit web app that lets you chat with your PDF
|
| 12 |
---
|
| 13 |
|
| 14 |
-
#
|
| 15 |
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: PDF Chat Assistant
|
| 3 |
+
emoji: 📄
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
app_file: src/app.py
|
|
|
|
|
|
|
| 8 |
pinned: false
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# PDF Chat Assistant
|
| 12 |
|
| 13 |
+
Interact with your PDF using Retrieval-Augmented Generation (RAG) + Gemini.
|
| 14 |
+
Upload a PDF, it is chunked, embedded, and you can ask questions with contextual, streamed answers.
|
| 15 |
|
| 16 |
+
## Features
|
| 17 |
+
- PDF upload & inline preview
|
| 18 |
+
- Automatic text extraction, cleaning, chunking
|
| 19 |
+
- Embedding storage (pickle vector store)
|
| 20 |
+
- Similarity-based context retrieval
|
| 21 |
+
- Gemini response generation (streaming)
|
| 22 |
+
- Scrollable chat UI
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
## Conda Setup
|
| 26 |
+
|
| 27 |
+
```bash
|
| 28 |
+
git clone https://github.com/Seif-aber/pdf_chat_assistant
|
| 29 |
+
cd pdf-chat-assistant
|
| 30 |
+
|
| 31 |
+
# Create environment
|
| 32 |
+
conda create -n pdfchat python=3.12 -y
|
| 33 |
+
conda activate pdfchat
|
| 34 |
+
|
| 35 |
+
# Install dependencies
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## Environment Variables
|
| 40 |
+
|
| 41 |
+
Create a `.env` file in project root:
|
| 42 |
+
|
| 43 |
+
```
|
| 44 |
+
GEMINI_API_KEY=your_key_here
|
| 45 |
+
GEMINI_MODEL=gemini-2.5-flash
|
| 46 |
+
EMBEDDING_MODEL=models/embedding-001
|
| 47 |
+
STREAMLIT_PORT=8501
|
| 48 |
+
MAX_PDF_SIZE_MB=10
|
| 49 |
+
CHUNK_SIZE=1000
|
| 50 |
+
CHUNK_OVERLAP=200
|
| 51 |
+
UPLOAD_FOLDER=data/uploads
|
| 52 |
+
EMBEDDINGS_FOLDER=data/embeddings
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
Then:
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
streamlit run src/app.py --server.port $STREAMLIT_PORT
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## How It Works
|
| 62 |
+
1. Upload PDF → saved to a temp file.
|
| 63 |
+
2. Text extracted (PyPDF2 / pypdf fallback) and chunked with overlap.
|
| 64 |
+
3. Each chunk embedded via Gemini Embeddings API.
|
| 65 |
+
4. On question: create query embedding → cosine similarity → top chunks form context.
|
| 66 |
+
5. Gemini model generates constrained to context.
|
config/__init__.py
ADDED
File without changes
config/settings.py
ADDED
@@ -0,0 +1,58 @@
+"""Central application configuration"""
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+
+_PROJECT_ROOT = Path(__file__).resolve().parent.parent
+_DOTENV_PATH = _PROJECT_ROOT / ".env"
+
+load_dotenv(dotenv_path=_DOTENV_PATH, override=False)
+
+class Config:
+    """Holds application configuration values loaded from .env only (no silent fallbacks)."""
+
+    GEMINI_API_KEY: str | None = os.getenv("GEMINI_API_KEY")
+    GEMINI_MODEL: str | None = os.getenv("GEMINI_MODEL")
+    EMBEDDING_MODEL: str | None = os.getenv("EMBEDDING_MODEL")
+
+    STREAMLIT_PORT: str | None = os.getenv("STREAMLIT_PORT")
+    MAX_PDF_SIZE_MB: str | None = os.getenv("MAX_PDF_SIZE_MB")
+    CHUNK_SIZE: str | None = os.getenv("CHUNK_SIZE")
+    CHUNK_OVERLAP: str | None = os.getenv("CHUNK_OVERLAP")
+
+    UPLOAD_FOLDER: str | None = os.getenv("UPLOAD_FOLDER")
+    EMBEDDINGS_FOLDER: str | None = os.getenv("EMBEDDINGS_FOLDER")
+
+    EMBEDDING_STORAGE_PATH: str | None = None
+
+    @classmethod
+    def validate(cls) -> None:
+        """Validate required variables & finalize derived values."""
+        required = {
+            "GEMINI_API_KEY": cls.GEMINI_API_KEY,
+            "GEMINI_MODEL": cls.GEMINI_MODEL,
+            "EMBEDDING_MODEL": cls.EMBEDDING_MODEL,
+            "STREAMLIT_PORT": cls.STREAMLIT_PORT,
+            "MAX_PDF_SIZE_MB": cls.MAX_PDF_SIZE_MB,
+            "CHUNK_SIZE": cls.CHUNK_SIZE,
+            "CHUNK_OVERLAP": cls.CHUNK_OVERLAP,
+            "UPLOAD_FOLDER": cls.UPLOAD_FOLDER,
+            "EMBEDDINGS_FOLDER": cls.EMBEDDINGS_FOLDER,
+        }
+        missing = [k for k, v in required.items() if not v]
+        if missing:
+            raise ValueError(f"Missing required environment variables in .env: {', '.join(missing)}")
+
+        cls.STREAMLIT_PORT = int(cls.STREAMLIT_PORT)
+        cls.MAX_PDF_SIZE_MB = int(cls.MAX_PDF_SIZE_MB)
+        cls.CHUNK_SIZE = int(cls.CHUNK_SIZE)
+        cls.CHUNK_OVERLAP = int(cls.CHUNK_OVERLAP)
+
+        os.makedirs(cls.UPLOAD_FOLDER, exist_ok=True)
+        os.makedirs(cls.EMBEDDINGS_FOLDER, exist_ok=True)
+
+        cls.EMBEDDING_STORAGE_PATH = os.path.join(cls.EMBEDDINGS_FOLDER, "pdf_embeddings.pkl")  # type: ignore
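A minimal usage sketch of the fail-fast config above (not part of the commit; assumes it runs from the project root):

```python
# validate() either raises with the list of missing .env keys, or coerces
# the numeric fields and derives the pickle storage path.
from config.settings import Config

try:
    Config.validate()
    print(type(Config.CHUNK_SIZE))         # <class 'int'> after coercion
    print(Config.EMBEDDING_STORAGE_PATH)   # e.g. data/embeddings/pdf_embeddings.pkl
except ValueError as e:
    print(e)  # "Missing required environment variables in .env: ..."
```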
requirements.txt
CHANGED
@@ -1,3 +1,7 @@
-
-
-
+streamlit
+PyPDF2
+pypdf
+numpy
+google-generativeai>=0.7.0
+python-dotenv
+pydantic
src/app.py
ADDED
@@ -0,0 +1,167 @@
+import streamlit as st
+import os
+import tempfile
+import hashlib
+from components.file_uploader import FileUploader
+from components.pdf_viewer import PdfViewer
+from components.chat_interface import ChatInterface
+from services.pdf_processor import PDFProcessor
+from services.embedding_service import EmbeddingService
+from services.gemini_client import GeminiClient
+from services.rag_service import RAGService
+
+def initialize_session_state():
+    defaults = {
+        "chat_history": [],
+        "pdf_processed": False,
+        "pdf_id": None,
+        "pdf_chunks": [],
+        "uploaded_file_path": None,
+        "current_file_name": None,
+        "current_file_hash": None,
+        "processing": False,
+        "streaming": False,
+        "chat_input": "",
+        "clear_chat_input": False,  # flag: clear the text input on the next rerun
+    }
+    for k, v in defaults.items():
+        if k not in st.session_state:
+            st.session_state[k] = v
+
+def reset_app_state(embedding_service: EmbeddingService) -> None:
+    old_path = st.session_state.get("uploaded_file_path")
+    if old_path and os.path.exists(old_path):
+        try: os.unlink(old_path)
+        except Exception: pass
+    try:
+        embedding_service.vector_store.clear_embeddings()
+    except Exception:
+        pass
+    st.session_state.chat_history = []
+    st.session_state.pdf_processed = False
+    st.session_state.pdf_id = None
+    st.session_state.pdf_chunks = []
+    st.session_state.uploaded_file_path = None
+    st.session_state.current_file_name = None
+    st.session_state.current_file_hash = None
+    st.session_state.processing = False
+    st.session_state.streaming = False
+
+def _file_hash(uploaded_file) -> str:
+    return hashlib.md5(uploaded_file.getvalue()).hexdigest()
+
+def auto_process_pdf(uploaded_file, tmp_file_path, embedding_service: EmbeddingService, force: bool = False):
+    if st.session_state.processing:
+        return
+    if st.session_state.pdf_processed and not force:
+        return
+    st.session_state.processing = True
+    status = st.empty()
+    try:
+        status.markdown("⏳ Processing PDF... 10%")
+        pdf_processor = PDFProcessor()
+        chunks = pdf_processor.process_pdf(tmp_file_path)
+        if not chunks:
+            status.error("Failed to extract text.")
+            return
+        pdf_id = uploaded_file.name.replace(".pdf", "").replace(" ", "_").replace(".", "_")
+        st.session_state.pdf_id = pdf_id
+        st.session_state.pdf_chunks = chunks
+        status.markdown("⏳ Processing PDF... 50%")
+        embedding_service.store_pdf_embeddings(pdf_id, chunks)
+        status.markdown("⏳ Processing PDF... 90%")
+        st.session_state.pdf_processed = True
+        status.success(f"✅ Processing complete (100%). {len(chunks)} chunks ready.")
+    except Exception as e:
+        status.error(f"❌ Error: {e}")
+    finally:
+        st.session_state.processing = False
+
+def main():
+    st.set_page_config(page_title="PDF Chat Assistant", page_icon="📄", layout="wide")
+    st.title("📄 PDF Chat Assistant")
+    initialize_session_state()
+
+    embedding_service = EmbeddingService()
+    gemini_client = GeminiClient()
+    rag_service = RAGService(embedding_service, gemini_client)
+
+    col1, col2 = st.columns([1, 1])
+
+    with col1:
+        st.header("📁 Upload & Preview PDF")
+        uploaded_file = FileUploader().upload_file()
+        if uploaded_file:
+            new_hash = _file_hash(uploaded_file)
+            if st.session_state.current_file_hash and st.session_state.current_file_hash != new_hash:
+                reset_app_state(embedding_service)
+            if st.session_state.current_file_hash != new_hash:
+                st.session_state.current_file_name = uploaded_file.name
+                st.session_state.current_file_hash = new_hash
+                st.session_state.pdf_processed = False
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                tmp.write(uploaded_file.getvalue())
+                tmp_path = tmp.name
+            st.session_state.uploaded_file_path = tmp_path
+            PdfViewer().display_pdf(tmp_path)
+            auto_process_pdf(uploaded_file, tmp_path, embedding_service)
+            if st.session_state.pdf_processed and not st.session_state.processing:
+                if st.button("🔄 Reprocess PDF"):
+                    st.session_state.pdf_processed = False
+                    auto_process_pdf(uploaded_file, tmp_path, embedding_service, force=True)
+        else:
+            st.info("Upload a PDF to begin.")
+
+    with col2:
+        st.header("💬 Chat with your PDF")
+        if st.session_state.processing:
+            st.info("⏳ Processing... Please wait.")
+            return
+
+        chat_ui = ChatInterface()
+
+        if st.session_state.pdf_processed and st.session_state.pdf_id:
+            if st.session_state.clear_chat_input:
+                st.session_state.chat_input = ""
+                st.session_state.clear_chat_input = False
+
+            chat_ui.render(st.session_state.chat_history)
+            disabled = st.session_state.streaming
+            user_input = st.text_input(
+                "Ask a question:",
+                key="chat_input",
+                placeholder="Type your question...",
+                disabled=disabled,
+                label_visibility="collapsed"
+            )
+            send = st.button("Send", disabled=disabled or not user_input.strip(), use_container_width=True)
+
+            if send and user_input.strip():
+                query = user_input.strip()
+                st.session_state.chat_history.append({"role": "user", "content": query})
+                st.session_state.streaming = True
+
+                st.session_state.clear_chat_input = True
+                stream_iter = rag_service.stream_response(
+                    query,
+                    st.session_state.pdf_id,
+                    st.session_state.chat_history
+                )
+                assistant_text = chat_ui.stream_assistant(st.session_state.chat_history, stream_iter)
+                st.session_state.chat_history.append({"role": "assistant", "content": assistant_text})
+                st.session_state.streaming = False
+                st.rerun()
+
+            col_a, col_b = st.columns([1, 1])
+            with col_a:
+                if st.button("Clear Chat", disabled=st.session_state.streaming):
+                    st.session_state.chat_history = []
+                    st.session_state.clear_chat_input = True
+                    st.rerun()
+            with col_b:
+                pass
+        else:
+            st.info("Upload and wait for processing to chat.")
+
+if __name__ == "__main__":
+    main()
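A note on the `clear_chat_input` flag above: Streamlit raises an exception if a widget's session-state key is assigned after that widget has been instantiated in the same script run, so the input is blanked at the top of the *next* run instead. A standalone sketch of the same pattern (illustration only, not from the commit):

```python
# Deferred-clear pattern: set a flag when a message is sent, then blank the
# widget's state key at the start of the next run, before st.text_input exists.
import streamlit as st

if "clear_chat_input" not in st.session_state:
    st.session_state.clear_chat_input = False

if st.session_state.clear_chat_input:   # runs before the widget is created
    st.session_state.chat_input = ""
    st.session_state.clear_chat_input = False

text = st.text_input("Message", key="chat_input")
if st.button("Send") and text:
    st.session_state.clear_chat_input = True
    st.rerun()
```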
src/components/__init__.py
ADDED
@@ -0,0 +1 @@
+# This file is intentionally left blank.
src/components/chat_interface.py
ADDED
@@ -0,0 +1,120 @@
+"""Scrollable + streaming chat interface."""
+import streamlit as st
+from typing import List, Dict
+import html
+import time
+
+_CHAT_CSS = """
+<style>
+#chat-container {
+    height: 520px;
+    overflow-y: auto;
+    padding: 0.5rem 0.75rem 0.25rem 0.75rem;
+    border: 1px solid #e3e3e3;
+    border-radius: 10px;
+    background: #fafafa;
+    scroll-behavior: smooth;
+}
+.chat-msg { margin: 0 0 14px 0; max-width: 85%; }
+.chat-row-user { display:flex; justify-content:flex-end; }
+.chat-row-assistant { display:flex; justify-content:flex-start; }
+.bubble {
+    padding:10px 14px;
+    border-radius:14px;
+    line-height:1.35;
+    font-size:0.93rem;
+    box-shadow:0 1px 2px rgba(0,0,0,0.08);
+    word-wrap:break-word;
+    white-space:pre-wrap;
+}
+.bubble-user {
+    background:linear-gradient(135deg,#4b8df8,#2563eb);
+    color:#fff;
+    border-bottom-right-radius:4px;
+}
+.bubble-assistant {
+    background:#ffffff;
+    border:1px solid #ddd;
+    border-bottom-left-radius:4px;
+}
+.meta {
+    font-size:0.6rem;
+    opacity:0.55;
+    margin-top:4px;
+    text-align:right;
+    user-select:none;
+}
+</style>
+<script>
+function scrollChat(){
+    const el = window.parent.document.querySelector('#chat-container');
+    if(el){ el.scrollTop = el.scrollHeight; }
+}
+</script>
+"""
+
+class ChatInterface:
+    """Renders scrollable chat and supports streaming assistant output."""
+
+    def __init__(self):
+        self.chat_history = []
+
+    def render(self, chat_history: List[Dict]) -> None:
+        st.markdown(_CHAT_CSS, unsafe_allow_html=True)
+        if not chat_history:
+            st.info("No messages yet. Ask something about the PDF.")
+            return
+        st.markdown(self._history_to_html(chat_history), unsafe_allow_html=True)
+
+    def stream_assistant(self, chat_history: List[Dict], stream_iter) -> str:
+        """
+        Render existing messages then stream new assistant message.
+        Returns final assistant text.
+        """
+        st.markdown(_CHAT_CSS, unsafe_allow_html=True)
+        placeholder = st.empty()
+        assistant_text = ""
+        # Re-render on each chunk for smooth streaming
+        for chunk in stream_iter:
+            assistant_text += chunk
+            merged = chat_history + [{"role": "assistant", "content": assistant_text}]
+            placeholder.markdown(self._history_to_html(merged), unsafe_allow_html=True)
+            time.sleep(0.03)
+        return assistant_text
+
+    def input_box(self, key: str = "chat_input") -> str:
+        return st.text_input(
+            "Ask a question:",
+            key=key,
+            placeholder="Type your question and press Enter...",
+            label_visibility="collapsed",
+        )
+
+    def add_message(self, role: str, content: str):
+        """
+        Add a message to chat history
+        """
+        self.chat_history.append({
+            "role": role,
+            "content": content
+        })
+
+    def clear_chat(self):
+        """Clear the chat history"""
+        self.chat_history = []
+
+    def _history_to_html(self, history: List[Dict]) -> str:
+        rows = []
+        for m in history:
+            role = m.get("role", "user")
+            safe = html.escape(m.get("content", ""))
+            row_cls = "chat-row-user" if role == "user" else "chat-row-assistant"
+            bub_cls = "bubble bubble-user" if role == "user" else "bubble bubble-assistant"
+            label = "You" if role == "user" else "Assistant"
+            rows.append(
+                f'<div class="{row_cls}"><div class="chat-msg">'
+                f'<div class="{bub_cls}">{safe}</div>'
+                f'<div class="meta">{label}</div>'
+                f'</div></div>'
+            )
+        return f'<div id="chat-container">{"".join(rows)}</div><script>scrollChat();</script>'
src/components/file_uploader.py
ADDED
@@ -0,0 +1,27 @@
+"""Streamlit component: PDF file uploader."""
+import streamlit as st
+from typing import Optional
+
+UploadedFile = "UploadedFile"
+
+class FileUploader:
+    """Encapsulates upload widget usage."""
+
+    def __init__(self) -> None:
+        """Initialize with no uploaded file."""
+        self.uploaded_file: Optional[st.runtime.uploaded_file_manager.UploadedFile] = None
+
+    def upload_file(self) -> Optional[st.runtime.uploaded_file_manager.UploadedFile]:
+        """
+        Render uploader and return uploaded file.
+        """
+        self.uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
+        if self.uploaded_file:
+            return self.uploaded_file
+        return None
+
+    def get_file_content(self) -> Optional[bytes]:
+        """
+        Return raw bytes of uploaded file.
+        """
+        return self.uploaded_file.getvalue() if self.uploaded_file else None
src/components/pdf_viewer.py
ADDED
@@ -0,0 +1,57 @@
+"""Embed a PDF file in the Streamlit UI (base64 iframe fallback)."""
+import streamlit as st
+import base64
+import os
+from PyPDF2 import PdfReader
+
+class PdfViewer:
+    """Display a PDF document inside the app."""
+
+    def display_pdf(self, pdf_path: str) -> None:
+        """
+        Render only the PDF iframe (metrics removed).
+
+        Args:
+            pdf_path: Path to local PDF file.
+        """
+        try:
+            self._iframe(pdf_path)
+        except Exception as e:
+            st.error(f"PDF preview error: {e}")
+
+    def _iframe(self, pdf_path: str) -> None:
+        """
+        Create a base64 iframe embed.
+
+        Args:
+            pdf_path: Path to PDF.
+        """
+        try:
+            with open(pdf_path, "rb") as f:
+                data = f.read()
+            b64 = base64.b64encode(data).decode("utf-8")
+            html = f"""
+            <div style="width:100%; height:600px; border:1px solid #ddd; border-radius:4px; overflow:hidden;">
+                <iframe src="data:application/pdf;base64,{b64}" width="100%" height="100%" style="border:none;"></iframe>
+            </div>
+            """
+            st.markdown(html, unsafe_allow_html=True)
+            st.download_button("📥 Download PDF", data, file_name=os.path.basename(pdf_path), mime="application/pdf")
+        except Exception as e:
+            st.warning(f"Inline PDF display failed: {e}")
+
+    def _info(self, pdf_path: str) -> dict:
+        """
+        Collect minimal PDF info (retained for potential future use).
+
+        Args:
+            pdf_path: Path to PDF.
+
+        Returns:
+            Dict with num_pages & encrypted flag.
+        """
+        try:
+            reader = PdfReader(pdf_path)
+            return {"num_pages": len(reader.pages), "encrypted": reader.is_encrypted}
+        except Exception:
+            return {"num_pages": 0, "encrypted": False}
src/models/__init__.py
ADDED
File without changes
src/models/chat_models.py
ADDED
@@ -0,0 +1,21 @@
+from pydantic import BaseModel
+from typing import List, Optional
+
+class Message(BaseModel):
+    user_id: str
+    content: str
+    timestamp: str
+
+class ChatContext(BaseModel):
+    messages: List[Message]
+    pdf_id: Optional[str] = None
+
+class UserPrompt(BaseModel):
+    user_id: str
+    prompt: str
+    context: ChatContext
+
+class AssistantResponse(BaseModel):
+    response: str
+    context: ChatContext
+    pdf_id: Optional[str] = None
src/services/__init__.py
ADDED
File without changes
src/services/embedding_service.py
ADDED
@@ -0,0 +1,140 @@
+"""Generate, store, and query embeddings via Gemini API."""
+import numpy as np
+import google.generativeai as genai
+from typing import List, Dict, Optional
+from config.settings import Config
+from src.utils.vector_store import VectorStore
+
+class EmbeddingService:
+    """Handles embedding generation, storage, and similarity search."""
+
+    def __init__(self) -> None:
+        """Configure Gemini and initialize vector store."""
+        Config.validate()
+        genai.configure(api_key=Config.GEMINI_API_KEY)
+        self.embedding_model = Config.EMBEDDING_MODEL
+        self.vector_store = VectorStore(storage_path=Config.EMBEDDING_STORAGE_PATH)
+
+    def generate_embeddings(self, texts: List[str]) -> List[np.ndarray]:
+        """
+        Embed a list of document texts.
+
+        Args:
+            texts: List of strings.
+
+        Returns:
+            List of embedding vectors (np.ndarray).
+        """
+        embeddings: List[np.ndarray] = []
+        for i, text in enumerate(texts):
+            try:
+                result = genai.embed_content(
+                    model=self.embedding_model,
+                    content=text,
+                    task_type="retrieval_document",
+                )
+                embeddings.append(np.array(result["embedding"]))
+            except Exception as e:
+                print(f"[EmbeddingService] Doc embed error idx {i}: {e}")
+                embeddings.append(np.zeros(768))
+        return embeddings
+
+    def generate_query_embedding(self, query: str) -> np.ndarray:
+        """
+        Create an embedding for a query.
+
+        Args:
+            query: User query text.
+
+        Returns:
+            Query embedding vector.
+        """
+        try:
+            result = genai.embed_content(
+                model=self.embedding_model,
+                content=query,
+                task_type="retrieval_query",
+            )
+            return np.array(result["embedding"])
+        except Exception as e:
+            print(f"[EmbeddingService] Query embed error: {e}")
+            return np.zeros(768)
+
+    def store_pdf_embeddings(self, pdf_id: str, chunks: List[str]) -> None:
+        """
+        Embed and store all chunks for a PDF (replacing previous).
+
+        Args:
+            pdf_id: Unique PDF identifier.
+            chunks: List of chunk strings.
+        """
+        self.clear_pdf_embeddings(pdf_id)
+        for idx, (chunk, vec) in enumerate(zip(chunks, self.generate_embeddings(chunks))):
+            key = f"{pdf_id}_chunk_{idx}"
+            self.vector_store.add_embedding(
+                key=key,
+                vector=vec.tolist(),
+                metadata={"pdf_id": pdf_id, "chunk_index": idx, "text": chunk},
+            )
+
+    def find_similar_chunks(self, query: str, pdf_id: Optional[str] = None, top_k: int = 3) -> List[Dict]:
+        """
+        Retrieve top_k most similar stored chunks.
+
+        Args:
+            query: User query string.
+            pdf_id: Restrict to given PDF id if set.
+            top_k: Number of results.
+
+        Returns:
+            List of similarity result dicts.
+        """
+        q_vec = self.generate_query_embedding(query)
+        results = []
+        for key in self.vector_store.get_all_embeddings():
+            if pdf_id and not key.startswith(f"{pdf_id}_"):
+                continue
+            data = self.vector_store.get_embedding_data(key)
+            if not data:
+                continue
+            vec = np.array(data["vector"])
+            sim = self._cosine_similarity(q_vec, vec)
+            md = data.get("metadata", {})
+            results.append(
+                {
+                    "key": key,
+                    "similarity": sim,
+                    "text": md.get("text", ""),
+                    "chunk_index": md.get("chunk_index", 0),
+                    "pdf_id": md.get("pdf_id", ""),
+                }
+            )
+        results.sort(key=lambda r: r["similarity"], reverse=True)
+        return results[:top_k]
+
+    def clear_pdf_embeddings(self, pdf_id: str) -> None:
+        """
+        Remove all embeddings tied to a PDF.
+
+        Args:
+            pdf_id: Identifier.
+        """
+        self.vector_store.remove_embeddings_by_prefix(f"{pdf_id}_")
+
+    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
+        """
+        Compute cosine similarity.
+
+        Args:
+            a: Vector A
+            b: Vector B
+
+        Returns:
+            Cosine similarity or 0.0 on failure.
+        """
+        if not a.any() or not b.any():
+            return 0.0
+        denom = (np.linalg.norm(a) * np.linalg.norm(b))
+        if denom == 0:
+            return 0.0
+        return float(np.dot(a, b) / denom)
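As a worked check of the `_cosine_similarity` math above (toy vectors, not real embeddings):

```python
import numpy as np

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])
# dot = 1, |a| = |b| = sqrt(2), so similarity = 1 / 2
print(float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))))  # 0.5
```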
src/services/gemini_client.py
ADDED
@@ -0,0 +1,77 @@
+import sys
+import os
+import google.generativeai as genai
+from typing import List, Dict, Optional, Iterator
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+
+from config.settings import Config
+
+class GeminiClient:
+    """Generate responses (full or streaming) using Gemini with optional context & history."""
+
+    def __init__(self) -> None:
+        """Configure model instance."""
+        Config.validate()
+        genai.configure(api_key=Config.GEMINI_API_KEY)
+        self.model = genai.GenerativeModel(Config.GEMINI_MODEL)
+
+    def generate_response(self, prompt: str, context: str = "", chat_history: Optional[List[Dict]] = None) -> str:
+        """
+        Produce a model response.
+
+        Args:
+            prompt: User question.
+            context: Retrieved PDF context.
+            chat_history: Prior messages list.
+
+        Returns:
+            Response string (or error message).
+        """
+        try:
+            full_prompt = self._build_prompt(prompt, context, chat_history)
+            resp = self.model.generate_content(full_prompt)
+            return getattr(resp, "text", "").strip() or "No response generated."
+        except Exception as e:
+            return f"Error generating response: {e}"
+
+    def stream_response(self, prompt: str, context: str = "", chat_history: Optional[List[Dict]] = None) -> Iterator[str]:
+        """
+        Stream model tokens/chunks. Yields incremental text fragments.
+        """
+        try:
+            full_prompt = self._build_prompt(prompt, context, chat_history)
+            for chunk in self.model.generate_content(full_prompt, stream=True):
+                txt = getattr(chunk, "text", "")
+                if txt:
+                    yield txt
+        except Exception as e:
+            yield f"[Error] {e}"
+
+    def _build_prompt(self, user_prompt: str, context: str, chat_history: Optional[List[Dict]]) -> str:
+        """
+        Construct final prompt sent to LLM.
+
+        Args:
+            user_prompt: Current question.
+            context: Retrieved context text.
+            chat_history: List of previous user/assistant dicts.
+
+        Returns:
+            Combined prompt string.
+        """
+        system = (
+            "You are an assistant answering questions about an uploaded PDF. "
+            "Base answers only on provided context. If unknown, say you lack the info."
+        )
+        parts = [system]
+        if context:
+            parts.append(f"\nContext:\n{context}")
+        if chat_history:
+            parts.append("\nRecent conversation:")
+            for m in chat_history[-5:]:
+                role = m.get("role", "user")
+                content = m.get("content", "")
+                parts.append(f"{role}: {content}")
+        parts.append(f"\nQuestion: {user_prompt}\nAnswer:")
+        return "\n".join(parts)
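To make `_build_prompt` concrete, here is roughly the string it assembles for a short history (hypothetical values; `_build_prompt` is private and called here only for illustration, and constructing the client requires a valid `.env`):

```python
from services.gemini_client import GeminiClient

client = GeminiClient()
prompt = client._build_prompt(
    "What is the warranty period?",
    context="[Chunk 1 sim=0.82]\nThe warranty lasts 24 months.",
    chat_history=[{"role": "user", "content": "Hi"},
                  {"role": "assistant", "content": "Hello!"}],
)
print(prompt)
# You are an assistant answering questions about an uploaded PDF. Base answers only on provided context. If unknown, say you lack the info.
#
# Context:
# [Chunk 1 sim=0.82]
# The warranty lasts 24 months.
#
# Recent conversation:
# user: Hi
# assistant: Hello!
#
# Question: What is the warranty period?
# Answer:
```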
src/services/pdf_processor.py
ADDED
@@ -0,0 +1,94 @@
+import sys
+import os
+from typing import List, Dict, Optional
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+
+try:
+    from PyPDF2 import PdfReader
+except ImportError:
+    try:
+        from pypdf import PdfReader
+    except ImportError:
+        print("Error: PDF reading library not found. Please install PyPDF2 or pypdf.")
+        PdfReader = None
+
+from src.utils.chunking import chunk_pdf_text, clean_text
+from config.settings import Config
+
+class PDFProcessor:
+    """Process PDFs into cleaned text chunks."""
+
+    def __init__(self, chunk_size: Optional[int] = None, overlap: Optional[int] = None) -> None:
+        """
+        Initialize processor with chunk parameters.
+
+        Args:
+            chunk_size: Characters per chunk (defaults to config).
+            overlap: Overlap between chunks (defaults to config).
+        """
+        self.chunk_size = chunk_size or Config.CHUNK_SIZE
+        self.overlap = overlap or Config.CHUNK_OVERLAP
+
+    def process_pdf(self, file_path: str) -> List[str]:
+        """
+        Read PDF, extract text, clean, and chunk.
+
+        Args:
+            file_path: Path to PDF.
+
+        Returns:
+            List of chunk strings.
+        """
+        raw = self._extract_text(file_path)
+        if not raw.strip():
+            return []
+        cleaned = clean_text(raw)
+        chunks = chunk_pdf_text(cleaned, self.chunk_size, self.overlap)
+        return [c for c in chunks if len(c.strip()) > 50]
+
+    def get_pdf_info(self, file_path: str) -> Dict:
+        """
+        Retrieve simple info (pages, metadata, encryption).
+
+        Args:
+            file_path: Path to PDF.
+
+        Returns:
+            Dict of info.
+        """
+        try:
+            reader = PdfReader(file_path)
+            return {
+                "num_pages": len(reader.pages),
+                "metadata": reader.metadata,
+                "encrypted": reader.is_encrypted,
+            }
+        except Exception as e:
+            print(f"[PDFProcessor] Info error: {e}")
+            return {}
+
+    def _extract_text(self, file_path: str) -> str:
+        """
+        Extract text from all pages.
+
+        Args:
+            file_path: Path to PDF.
+
+        Returns:
+            Concatenated text with page separators.
+        """
+        try:
+            reader = PdfReader(file_path)
+            out: List[str] = []
+            for idx, page in enumerate(reader.pages):
+                try:
+                    text = page.extract_text() or ""
+                    if text.strip():
+                        out.append(f"\n--- Page {idx+1} ---\n{text}")
+                except Exception as pe:
+                    print(f"[PDFProcessor] Page {idx+1} extraction failed: {pe}")
+            return "".join(out)
+        except Exception as e:
+            print(f"[PDFProcessor] Read error: {e}")
+            return ""
src/services/rag_service.py
ADDED
@@ -0,0 +1,105 @@
+import sys
+import os
+from typing import List, Dict, Optional, Iterator
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+
+from src.services.embedding_service import EmbeddingService
+from src.services.gemini_client import GeminiClient
+
+class RAGService:
+    """Combine retrieval + generation workflow."""
+
+    def __init__(self, embedding_service: EmbeddingService, gemini_client: GeminiClient) -> None:
+        """
+        Init RAG service.
+
+        Args:
+            embedding_service: EmbeddingService instance.
+            gemini_client: GeminiClient instance.
+        """
+        self.embedding_service = embedding_service
+        self.gemini_client = gemini_client
+
+    def get_response(self, user_query: str, pdf_id: str, chat_history: Optional[List[Dict]] = None) -> str:
+        """
+        Retrieve context & generate answer.
+
+        Args:
+            user_query: User question.
+            pdf_id: PDF identifier.
+            chat_history: Prior messages.
+
+        Returns:
+            Assistant answer text.
+        """
+        chunks = self.embedding_service.find_similar_chunks(user_query, pdf_id=pdf_id, top_k=3)
+        context = self._format_context(chunks)
+        return self.gemini_client.generate_response(user_query, context=context, chat_history=chat_history)
+
+    def stream_response(self, user_query: str, pdf_id: str, chat_history: Optional[List[Dict]] = None) -> Iterator[str]:
+        """
+        Retrieve context then stream model output.
+        """
+        chunks = self.embedding_service.find_similar_chunks(user_query, pdf_id=pdf_id, top_k=3)
+        context = self._format_context(chunks)
+        return self.gemini_client.stream_response(user_query, context=context, chat_history=chat_history)
+
+    def _format_context(self, chunks: List[Dict]) -> str:
+        """
+        Format retrieved chunks for prompt.
+
+        Args:
+            chunks: Retrieval result list.
+
+        Returns:
+            Joined context string.
+        """
+        if not chunks:
+            return ""
+        lines: List[str] = []
+        for idx, c in enumerate(chunks, start=1):
+            if c.get("similarity", 0) > 0.05:
+                lines.append(f"[Chunk {idx} sim={c['similarity']:.2f}]\n{c.get('text','')}")
+        return "\n\n".join(lines)
+
+    def retrieve_relevant_chunks(self, user_prompt: str, pdf_id: str, top_k: int = 3) -> List[Dict]:
+        """
+        Retrieve relevant chunks based on user prompt
+        """
+        return self.embedding_service.find_similar_chunks(
+            query=user_prompt,
+            pdf_id=pdf_id,
+            top_k=top_k
+        )
+
+    def generate_response_with_sources(self, user_query: str, pdf_id: str, chat_history: Optional[List[Dict]] = None) -> Dict:
+        """
+        Generate response with source information
+        """
+        try:
+            # Retrieve relevant chunks
+            relevant_chunks = self.retrieve_relevant_chunks(user_query, pdf_id)
+
+            # Prepare context
+            context = self._format_context(relevant_chunks)
+
+            # Generate response
+            response = self.gemini_client.generate_response(
+                prompt=user_query,
+                context=context,
+                chat_history=chat_history
+            )
+
+            return {
+                "response": response,
+                "sources": relevant_chunks,
+                "context_used": context
+            }
+
+        except Exception as e:
+            return {
+                "response": f"Sorry, I encountered an error: {str(e)}",
+                "sources": [],
+                "context_used": ""
+            }
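A hedged sketch of the sources-aware path above (assumes a valid `.env`, the project root on `sys.path`, and a PDF already embedded under the hypothetical id `sample`):

```python
from src.services.embedding_service import EmbeddingService
from src.services.gemini_client import GeminiClient
from src.services.rag_service import RAGService

rag = RAGService(EmbeddingService(), GeminiClient())
result = rag.generate_response_with_sources("Summarize the introduction", pdf_id="sample")
print(result["response"])
for source in result["sources"]:               # up to top-3 retrieved chunks
    print(source["chunk_index"], round(source["similarity"], 2))
```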
src/streamlit_app.py
DELETED
@@ -1,40 +0,0 @@
-import altair as alt
-import numpy as np
-import pandas as pd
-import streamlit as st
-
-"""
-# Welcome to Streamlit!
-
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))
src/utils/__init__.py
ADDED
File without changes
src/utils/chunking.py
ADDED
@@ -0,0 +1,37 @@
+"""Basic text cleaning and fixed-size overlapping chunking utilities."""
+from typing import List
+
+def clean_text(text: str) -> str:
+    """
+    Normalize whitespace in text.
+
+    Args:
+        text: Raw text.
+
+    Returns:
+        Cleaned single-spaced text.
+    """
+    return " ".join(text.split())
+
+def chunk_pdf_text(pdf_text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
+    """
+    Split text into overlapping chunks.
+
+    Args:
+        pdf_text: Full text.
+        chunk_size: Max chars per chunk.
+        overlap: Overlapping chars between chunks.
+
+    Returns:
+        List of chunk strings.
+    """
+    if chunk_size <= overlap:
+        raise ValueError("chunk_size must be greater than overlap")
+    chunks: List[str] = []
+    start = 0
+    length = len(pdf_text)
+    while start < length:
+        end = min(start + chunk_size, length)
+        chunks.append(pdf_text[start:end])
+        start += chunk_size - overlap
+    return chunks
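With the README defaults (`CHUNK_SIZE=1000`, `CHUNK_OVERLAP=200`), consecutive chunks start 800 characters apart. A quick check of the boundary arithmetic on toy values (assumes the project root on `sys.path`):

```python
from src.utils.chunking import chunk_pdf_text

text = "abcdefghij" * 5                       # 50 characters
chunks = chunk_pdf_text(text, chunk_size=20, overlap=5)
# starts at 0, 15, 30, 45 -> chunk lengths 20, 20, 20, 5
print([len(c) for c in chunks])               # [20, 20, 20, 5]
```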
src/utils/vector_store.py
ADDED
@@ -0,0 +1,101 @@
+"""Lightweight on-disk key → embedding store (pickle-based)."""
+from __future__ import annotations
+from typing import List, Dict, Optional, Any
+import pickle
+import os
+
+class VectorStore:
+    """Persist simple embedding entries (vector + metadata) to a pickle file."""
+
+    def __init__(self, storage_path: str) -> None:
+        """
+        Initialize the vector store.
+
+        Args:
+            storage_path: Path to pickle file used for persistence.
+        """
+        self.storage_path = storage_path
+        self.embeddings: Dict[str, Dict[str, Any]] = {}
+        os.makedirs(os.path.dirname(storage_path), exist_ok=True)
+        self.load_embeddings()
+
+    def load_embeddings(self) -> None:
+        """Load embeddings from disk if file exists."""
+        if os.path.exists(self.storage_path):
+            try:
+                with open(self.storage_path, "rb") as f:
+                    self.embeddings = pickle.load(f)
+            except Exception as e:
+                print(f"[VectorStore] Error loading embeddings: {e}")
+                self.embeddings = {}
+
+    def save_embeddings(self) -> None:
+        """Persist current embeddings to disk."""
+        try:
+            with open(self.storage_path, "wb") as f:
+                pickle.dump(self.embeddings, f)
+        except Exception as e:
+            print(f"[VectorStore] Error saving embeddings: {e}")
+
+    def add_embedding(self, key: str, vector: List[float], metadata: Optional[Dict] = None) -> None:
+        """
+        Add or overwrite an embedding entry.
+
+        Args:
+            key: Unique identifier (e.g. 'pdf1_chunk_0')
+            vector: Embedding vector as list of floats
+            metadata: Optional metadata dictionary
+        """
+        self.embeddings[key] = {"vector": vector, "metadata": metadata or {}}
+        self.save_embeddings()
+
+    def get_embedding_data(self, key: str) -> Optional[Dict]:
+        """
+        Retrieve full embedding entry.
+
+        Args:
+            key: Embedding key
+
+        Returns:
+            Dict with 'vector' and 'metadata' or None.
+        """
+        return self.embeddings.get(key)
+
+    def get_embedding_vector(self, key: str) -> Optional[List[float]]:
+        """
+        Retrieve only the vector.
+
+        Args:
+            key: Embedding key
+
+        Returns:
+            Vector list or None.
+        """
+        entry = self.embeddings.get(key)
+        return entry["vector"] if entry else None
+
+    def get_all_embeddings(self) -> List[str]:
+        """
+        List all embedding keys.
+
+        Returns:
+            List of keys.
+        """
+        return list(self.embeddings.keys())
+
+    def clear_embeddings(self) -> None:
+        """Remove all embeddings."""
+        self.embeddings = {}
+        self.save_embeddings()
+
+    def remove_embeddings_by_prefix(self, prefix: str) -> None:
+        """
+        Remove embeddings whose keys start with prefix.
+
+        Args:
+            prefix: Key prefix filter.
+        """
+        to_remove = [k for k in self.embeddings if k.startswith(prefix)]
+        for k in to_remove:
+            del self.embeddings[k]
+        self.save_embeddings()
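Finally, a minimal round-trip with the store above (hypothetical path and values; not part of the commit):

```python
from src.utils.vector_store import VectorStore

store = VectorStore("data/embeddings/demo.pkl")    # creates the folder if needed
store.add_embedding("doc1_chunk_0", [0.1, 0.2, 0.3], {"pdf_id": "doc1", "text": "hello"})
print(store.get_all_embeddings())                  # ['doc1_chunk_0']
print(store.get_embedding_vector("doc1_chunk_0"))  # [0.1, 0.2, 0.3]
store.remove_embeddings_by_prefix("doc1_")         # removal is persisted immediately
```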