LiamKhoaLe committed on
Commit 729a1f7 · 0 Parent(s): Initial commit

.gitignore ADDED
@@ -0,0 +1 @@
+ .env
Dockerfile ADDED
@@ -0,0 +1,50 @@
+ # Hugging Face Spaces - Docker
+ FROM python:3.11-slim
+
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+
+ # System deps
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential curl git libglib2.0-0 libgl1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create and use a non-root user
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy project files
+ COPY . .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Hugging Face cache directories
+ ENV HF_HOME="/home/user/.cache/huggingface"
+ ENV SENTENCE_TRANSFORMERS_HOME="/home/user/.cache/huggingface/sentence-transformers"
+ ENV MEDGEMMA_HOME="/home/user/.cache/huggingface/sentence-transformers"
+
+ # Create cache directories and set permissions
+ RUN mkdir -p /app/model_cache /home/user/.cache/huggingface/sentence-transformers && \
+     chown -R user:user /app/model_cache /home/user/.cache/huggingface
+
+ # Control preloading flags
+ ENV PRELOAD_TRANSLATORS="0"
+ ENV EMBEDDING_HALF="0"
+
+ # Preload embedding model and warmup
+ RUN python /app/dw_model.py && python /app/warmup.py
+
+ # Ensure ownership stays correct
+ RUN chown -R user:user /app/model_cache
+
+ # Expose port for HF Spaces
+ ENV PORT=7860
+ EXPOSE 7860
+
+ # Start FastAPI
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
LICENSE.txt ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [2025] [Dang Khoa Le]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,253 @@
+ ---
+ title: EdSummariser
+ emoji: 📚
+ colorFrom: indigo
+ colorTo: blue
+ sdk: docker
+ sdk_version: latest
+ pinned: false
+ license: apache-2.0
+ short_description: Ed-Assistant summarises your learning journey with Agentic RAG
+ ---
+
+ ### StudyBuddy RAG
+
+ An end-to-end RAG (Retrieval-Augmented Generation) app for studying from your own documents. Upload PDF/DOCX files and the app extracts text and images, captions the images, chunks the text into semantic "cards", embeds and stores them in MongoDB, and serves a chat endpoint that answers strictly from your uploaded materials. It also includes a lightweight chat-memory feature to improve context continuity, cost-aware model routing, and robust provider retries.
+
+ ## Features
+
+ - **Document ingestion**: PDF/DOCX parsing (PyMuPDF, python-docx), image extraction and BLIP-based captions
+ - **Semantic chunking**: heuristic heading/size-based chunker
+ - **Embeddings**: Sentence-Transformers (all-MiniLM-L6-v2 by default) with random fallback when unavailable
+ - **Vector search**: MongoDB Atlas Vector Search (optional) or local cosine fallback
+ - **RAG chat**: cost-aware routing between Gemini and NVIDIA endpoints
+ - **Chat memory**: per-user LRU of recent QA summaries; history and semantic retrieval to augment context
+ - **Summarization**: cheap extractive summaries via sumy with naive fallback
+ - **Centralized logging**: tagged loggers per module, e.g., [APP], [RAG], [CHUNKER]
+ - **Simple UI**: static frontend under `static/`
+
+ ## Prerequisites
+
+ - Python 3.10+
+ - MongoDB instance (local or Atlas). Collections are created automatically
+ - Optional: NVIDIA and/or Gemini API keys for model calls
+ - Optional but recommended: a virtual environment
+
+ ## Project Structure
+
+ ```text
+ app.py               # FastAPI app, routes, background ingestion, chat
+ utils/logger.py      # Centralized tagged logger
+ utils/parser.py      # PDF/DOCX parsing and image extraction
+ utils/caption.py     # BLIP image captioning (transformers)
+ utils/chunker.py     # Heuristic chunk builder
+ utils/embeddings.py  # Embedding client (Sentence-Transformers)
+ utils/rag.py         # Mongo-backed store and vector search
+ utils/rotator.py     # API key rotator + robust HTTP POST helper
+ utils/router.py      # Model selection + LLM invocation helpers
+ utils/summarizer.py  # sumy-based extractive summarizer
+ utils/common.py      # small helpers
+ memo/memory.py       # per-user LRU memory store
+ memo/history.py      # history relevance + semantic helpers
+ static/              # minimal frontend (index.html, script.js, styles.css)
+ Dockerfile           # container image
+ requirements.txt     # Python dependencies
+ ```
+
+ ## Quickstart (Local)
+
+ ```bash
+ python -m venv .venv && source .venv/bin/activate
+ pip install -r requirements.txt
+ export MONGO_URI="mongodb://localhost:27017"
+ uvicorn app:app --reload
+ ```
+
+ Open UI: `http://localhost:8000/static/`
+
+ Health: `http://localhost:8000/healthz`
+
+ ## Configuration
+
+ Environment variables (an illustrative export block follows the list):
+
+ - **MONGO_URI**: MongoDB connection string (required)
+ - **MONGO_DB**: MongoDB database name (default: studybuddy)
+ - **ATLAS_VECTOR**: set to "1" to enable Atlas Vector Search, else local cosine (default: 0)
+ - **MONGO_VECTOR_INDEX**: Atlas Search index name for vectors (default: vector_index)
+ - **EMBED_MODEL**: sentence-transformers model name (default: sentence-transformers/all-MiniLM-L6-v2)
+ - **GEMINI_API_1..5**: Gemini API keys for rotation
+ - **NVIDIA_API_1..5**: NVIDIA API keys for rotation
+ - **GEMINI_SMALL, GEMINI_MED, GEMINI_PRO**: override default Gemini models
+ - **NVIDIA_SMALL**: override default NVIDIA small model
+ - Optional logging controls: use process env like `PYTHONWARNINGS=ignore` and manage verbosity per logger if needed
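+
+ For illustration, a typical local setup might export something like the following before starting the server (all values are placeholders; only `MONGO_URI` is required):
+
+ ```bash
+ export MONGO_URI="mongodb://localhost:27017"   # required; local or Atlas URI
+ export MONGO_DB="studybuddy"                   # optional, shown with its default
+ export ATLAS_VECTOR="0"                        # "1" to enable Atlas Vector Search
+ export MONGO_VECTOR_INDEX="vector_index"
+ export EMBED_MODEL="sentence-transformers/all-MiniLM-L6-v2"
+ export GEMINI_API_1="<gemini-key>"             # GEMINI_API_1..5 rotate
+ export NVIDIA_API_1="<nvidia-key>"             # NVIDIA_API_1..5 rotate
+ ```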
+
+ Logging: Logs are sent to stdout at INFO level, tagged per module, e.g., `[APP]`, `[RAG]`. See `utils/logger.py`.
+
+ ## Running (Local)
+
+ ```bash
+ export MONGO_URI="mongodb://localhost:27017"  # or Atlas URI
+ uvicorn app:app --reload --workers 1 --host 0.0.0.0 --port 8000
+ ```
+
+ Open the UI: `http://localhost:8000/static/`
+
+ Health check: `http://localhost:8000/healthz`
+
+ ## Running (Docker)
+
+ Build and run:
+
+ ```bash
+ docker build -t studybuddy-rag .
+ docker run --rm -p 8000:7860 \
+   -e MONGO_URI="<your-mongo-uri>" \
+   -e MONGO_DB="studybuddy" \
+   -e NVIDIA_API_1="<nvidia-key>" \
+   -e GEMINI_API_1="<gemini-key>" \
+   studybuddy-rag
+ ```
+
+ The image listens on port 7860 (see the Dockerfile `CMD`), so the example maps host port 8000 to container port 7860; the UI is then at `http://localhost:8000/static/`.
+
+ For production, consider `--restart unless-stopped` and setting `--env ATLAS_VECTOR=1` if using Atlas Vector Search.
+
+ ## API Overview
+
+ - GET `/` → serves `static/index.html`
+ - POST `/upload` (multipart form-data)
+   - fields: `user_id` (str), `files` (one or more PDF/DOCX)
+   - response: `{ job_id, status: "processing" }`; ingestion proceeds in background
+ - GET `/cards`
+   - params: `user_id` (str), `filename` (optional), `limit` (int), `skip` (int)
+   - returns stored cards without embeddings
+ - GET `/file-summary`
+   - params: `user_id`, `filename`
+   - returns `{ filename, summary }`
+ - POST `/chat` (form-urlencoded)
+   - fields: `user_id`, `question`, `k` (int, default 6)
+   - logic:
+     - If question matches "what is <file> about?": returns file summary
+     - Else: classify relevant files via NVIDIA, augment with chat memory context, run vector search (restricted to relevant files if any), select model, generate answer, store QA summary in LRU
+   - returns `{ answer, sources }` (and `relevant_files` when no hits)
+
+ Example cURL (an illustrative response follows):
+
+ ```bash
+ curl -X POST http://localhost:8000/chat \
+   -H 'Content-Type: application/x-www-form-urlencoded' \
+   -d 'user_id=user1' \
+   --data-urlencode 'question=Summarize reinforcement learning from the uploaded notes.'
+ ```
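+
+ A successful `/chat` response has roughly the shape below; the values are made up for illustration, and each `sources` entry mirrors the per-chunk metadata stored at ingestion time:
+
+ ```json
+ {
+   "answer": "Reinforcement learning is ... (source: notes.pdf, Reinforcement Learning Basics)",
+   "sources": [
+     {
+       "filename": "notes.pdf",
+       "topic_name": "Reinforcement Learning Basics",
+       "page_span": [3, 5],
+       "score": 0.82,
+       "chunk_id": "66f1c2a9d4e8b3f0a1b2c3d4"
+     }
+   ]
+ }
+ ```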
+
+ Upload example:
+
+ ```bash
+ curl -X POST http://localhost:8000/upload \
+   -H 'Content-Type: multipart/form-data' \
+   -F 'user_id=user1' \
+   -F 'files=@/path/to/file1.pdf' \
+   -F 'files=@/path/to/file2.docx'
+ ```
+
+ List cards:
+
+ ```bash
+ curl 'http://localhost:8000/cards?user_id=user1&limit=10'
+ ```
+
+ ## MongoDB Atlas Vector Index (optional)
+
+ If using Atlas Vector Search, create an index (UI or API) similar to:
+
+ ```json
+ {
+   "mappings": {
+     "dynamic": false,
+     "fields": {
+       "embedding": {
+         "type": "knnVector",
+         "dimensions": 384,
+         "similarity": "cosine"
+       }
+     }
+   }
+ }
+ ```
+
+ Set `ATLAS_VECTOR=1` and `MONGO_VECTOR_INDEX` accordingly.
+
+ Schema overview (an illustrative document follows the list):
+
+ - Collection `chunks` (per card):
+   - `user_id` (str), `filename` (str), `topic_name` (str), `summary` (str), `content` (str)
+   - `page_span` ([int, int])
+   - `card_id` (slug + sequence)
+   - `embedding` (float[384])
+ - Collection `files` (per file):
+   - `user_id` (str), `filename` (str), `summary` (str)
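+
+ For illustration, a single `chunks` document might look like the sketch below (field values, including the exact `card_id` format, are made up for the example; `embedding` holds 384 floats):
+
+ ```json
+ {
+   "user_id": "user1",
+   "filename": "notes.pdf",
+   "topic_name": "Reinforcement Learning Basics",
+   "summary": "Introduces agents, rewards, and value functions.",
+   "content": "Reinforcement learning studies how agents act to maximise cumulative reward ...",
+   "page_span": [3, 5],
+   "card_id": "reinforcement-learning-basics-0003",
+   "embedding": [0.0123, -0.0456, "... 382 more floats ..."]
+ }
+ ```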
+
+ ## Notes on Models and Keys
+
+ - NVIDIA and Gemini calls use a simple key rotator. Provide one or more keys via `NVIDIA_API_1..5`, `GEMINI_API_1..5`.
+ - The app is defensive: if embeddings or summarization models are unavailable, it falls back to naive strategies to keep the app responsive (with reduced quality).
+
+ ## Logging and Observability
+
+ - Logs are tagged by module via `utils/logger.py`:
+   - [APP] app lifecycle, ingestion, chat flow
+   - [RAG] storage, vector search
+   - [EMBED] embedding model loads and fallbacks
+   - [CAPTION] BLIP model loads and captioning
+   - [ROUTER]/[ROTATOR] model routing and retry/rotation events
+   - [CHUNKER]/[SUM]/[COMMON]/[PARSER] module-specific messages
+ - Change verbosity by setting the root logger level in code if needed (see the sketch below)
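+
+ As a minimal sketch (placement is up to you, e.g., near the top of `app.py`), the standard library is enough to change global verbosity:
+
+ ```python
+ import logging
+
+ # Quieten or loosen all tagged loggers at once via the root logger
+ logging.getLogger().setLevel(logging.WARNING)  # or logging.DEBUG while debugging
+ ```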
+
+ ## Performance and Cost Tips
+
+ - Disable image captioning if CPU-bound by short-circuiting in `utils/caption.py` (return "")
+ - Use smaller `k` in `/chat` for fewer chunks
+ - Prefer NVIDIA_SMALL for simple questions (already default via router)
+ - If Atlas Vector is unavailable, local cosine search samples up to 2000 docs; tune in `utils/rag.py`
+ - Run with `--workers` and consider a process manager for production
+
+ ## Security Notes
+
+ - CORS is currently open (`allow_origins=["*"]`) for simplicity. Restrict it in production (see the sketch after this list)
+ - Validate and limit upload sizes at the reverse proxy (e.g., nginx) or add checks in `/upload`
+ - Secrets are passed via environment; avoid committing them
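+
+ As a sketch of the CORS restriction suggested above, the existing middleware block in `app.py` could be tightened to an explicit origin list (the origin shown is a placeholder for your deployment):
+
+ ```python
+ # app.py: replace the wide-open CORS setup with known origins only
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["https://your-frontend.example.com"],
+     allow_credentials=True,
+     allow_methods=["GET", "POST"],
+     allow_headers=["*"],
+ )
+ ```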
+
+ ## Troubleshooting
+
+ - Missing Python packages: install via `pip install -r requirements.txt`.
+ - Ingestion stalls: check `[APP]` logs; large files and image captioning (BLIP) can be slow on CPU.
+ - No vector hits:
+   - Ensure documents were embedded and stored (see `[RAG] Inserted ... cards` logs)
+   - Verify `MONGO_URI` and collection contents
+   - If Atlas Vector is on, confirm the index exists and `ATLAS_VECTOR=1`
+ - NVIDIA/Gemini errors: see `[ROUTER]`/`[ROTATOR]` logs; key rotation retries transient errors.
+ - PIL/transformers/torch issues on ARM Macs: ensure correct torch build or disable captioning
+ - PyMuPDF font warnings: generally safe to ignore; upgrade PyMuPDF if needed
+
+ ## Development
+
+ - Code style: straightforward, explicit names, tagged logging
+ - Frontend: simple static site in `static/`
+ - Extend chunking/embeddings or swap providers by editing modules in `utils/`
+ - Optional Makefile targets you can add:
+
+ ```Makefile
+ run:
+ 	uvicorn app:app --reload
+
+ docker-build:
+ 	docker build -t studybuddy-rag .
+
+ docker-run:
+ 	docker run --rm -p 8000:7860 -e MONGO_URI="mongodb://host.docker.internal:27017" studybuddy-rag
+ ```
+
+ ## License
+
+ Apache License 2.0. See `LICENSE.txt`.
+
app.py ADDED
@@ -0,0 +1,303 @@
1
+ import os, io, re, uuid, json, time, logging
2
+ from typing import List, Dict, Any, Optional
3
+
4
+ from fastapi import FastAPI, UploadFile, File, Form, Request, HTTPException, BackgroundTasks
5
+ from fastapi.responses import FileResponse, JSONResponse, HTMLResponse
6
+ from fastapi.staticfiles import StaticFiles
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+
9
+ from utils.rotator import APIKeyRotator
10
+ from utils.parser import parse_pdf_bytes, parse_docx_bytes
11
+ from utils.caption import BlipCaptioner
12
+ from utils.chunker import build_cards_from_pages
13
+ from utils.embeddings import EmbeddingClient
14
+ from utils.rag import RAGStore, ensure_indexes
15
+ from utils.router import select_model, generate_answer_with_model
16
+ from utils.summarizer import cheap_summarize
17
+ from utils.common import trim_text
18
+ from utils.logger import get_logger
19
+
20
+ # ────────────────────────────── App Setup ──────────────────────────────
21
+ logger = get_logger("APP", name="studybuddy")
22
+
23
+ app = FastAPI(title="StudyBuddy RAG", version="0.1.0")
24
+ app.add_middleware(
25
+ CORSMiddleware,
26
+ allow_origins=["*"],
27
+ allow_credentials=True,
28
+ allow_methods=["*"],
29
+ allow_headers=["*"],
30
+ )
31
+
32
+ # Serve static files (index.html, script.js, styles.css)
33
+ app.mount("/static", StaticFiles(directory="static"), name="static")
34
+
35
+
36
+ # ────────────────────────────── Global Clients ──────────────────────────────
37
+ # API rotators (round robin + auto failover on quota errors)
38
+ gemini_rotator = APIKeyRotator(prefix="GEMINI_API_", max_slots=5)
39
+ nvidia_rotator = APIKeyRotator(prefix="NVIDIA_API_", max_slots=5)
40
+
41
+ # Captioner + Embeddings (lazy init inside classes)
42
+ captioner = BlipCaptioner()
43
+ embedder = EmbeddingClient(model_name=os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2"))
44
+
45
+ # Mongo / RAG store
46
+ rag = RAGStore(mongo_uri=os.getenv("MONGO_URI"), db_name=os.getenv("MONGO_DB", "studybuddy"))
47
+ ensure_indexes(rag)
48
+
49
+
50
+ # ────────────────────────────── Helpers ──────────────────────────────
51
+ def _infer_mime(filename: str) -> str:
52
+ lower = filename.lower()
53
+ if lower.endswith(".pdf"):
54
+ return "application/pdf"
55
+ if lower.endswith(".docx"):
56
+ return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
57
+ return "application/octet-stream"
58
+
59
+
60
+ def _extract_pages(filename: str, file_bytes: bytes) -> List[Dict[str, Any]]:
61
+ mime = _infer_mime(filename)
62
+ if mime == "application/pdf":
63
+ return parse_pdf_bytes(file_bytes)
64
+ elif mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
65
+ return parse_docx_bytes(file_bytes)
66
+ else:
67
+ raise HTTPException(status_code=400, detail=f"Unsupported file type: {filename}")
68
+
69
+
70
+ # ────────────────────────────── Routes ──────────────────────────────
71
+ @app.get("/", response_class=HTMLResponse)
72
+ def index():
73
+ index_path = os.path.join("static", "index.html")
74
+ if not os.path.exists(index_path):
75
+ return HTMLResponse("<h1>StudyBuddy RAG</h1><p>Static files not found.</p>")
76
+ return FileResponse(index_path)
77
+
78
+
79
+ @app.post("/upload")
80
+ async def upload_files(
81
+ request: Request,
82
+ background_tasks: BackgroundTasks,
83
+ user_id: str = Form(...),
84
+ files: List[UploadFile] = File(...),
85
+ ):
86
+ """
87
+ Ingest many files: PDF/DOCX.
88
+ Steps:
89
+ 1) Extract text & images
90
+ 2) Caption images (BLIP base, CPU ok)
91
+ 3) Merge captions into page text
92
+ 4) Chunk into semantic cards (topic_name, summary, content + metadata)
93
+ 5) Embed with all-MiniLM-L6-v2
94
+ 6) Store in MongoDB with per-user and per-filename metadata
95
+ 7) Create a file-level summary
96
+ """
97
+ job_id = str(uuid.uuid4())
98
+ # Read file bytes upfront to avoid reading from closed streams in background task
99
+ preloaded_files = []
100
+ for uf in files:
101
+ raw = await uf.read()
102
+ preloaded_files.append((uf.filename, raw))
103
+ # Process files in background
104
+ async def _process():
105
+ total_cards = 0
106
+ file_summaries = []
107
+ for fname, raw in preloaded_files:
108
+ logger.info(f"[{job_id}] Parsing {fname} ({len(raw)} bytes)")
109
+ # Extract pages from file
110
+ pages = _extract_pages(fname, raw)
111
+ # Caption images per page (if any)
112
+ num_imgs = sum(len(p.get("images", [])) for p in pages)
113
+ captions = []
114
+ if num_imgs > 0:
115
+ for p in pages:
116
+ caps = []
117
+ for im in p.get("images", []):
118
+ try:
119
+ cap = captioner.caption_image(im)
120
+ caps.append(cap)
121
+ except Exception as e:
122
+ logger.warning(f"Caption error: {e}")
123
+ captions.append(caps)
124
+ else:
125
+ captions = [[] for _ in pages]
126
+ # Merge captions into text
127
+ for idx, p in enumerate(pages):
128
+ if captions[idx]:
129
+ p["text"] = (p.get("text", "") + "\n\n" + "\n".join([f"[Image] {c}" for c in captions[idx]])).strip()
130
+ # Build cards
131
+ cards = build_cards_from_pages(pages, filename=fname, user_id=user_id)
132
+ logger.info(f"[{job_id}] Built {len(cards)} cards for {fname}")
133
+ # Embed & store
134
+ embeddings = embedder.embed([c["content"] for c in cards])
135
+ for c, vec in zip(cards, embeddings):
136
+ c["embedding"] = vec
137
+ # Store cards in MongoDB on card
138
+ rag.store_cards(cards)
139
+ total_cards += len(cards)
140
+ # File-level summary (cheap extractive)
141
+ full_text = "\n\n".join(p.get("text", "") for p in pages)
142
+ file_summary = cheap_summarize(full_text, max_sentences=6)
143
+ rag.upsert_file_summary(user_id=user_id, filename=fname, summary=file_summary)
144
+ file_summaries.append({"filename": fname, "summary": file_summary})
145
+ logger.info(f"[{job_id}] Ingestion complete. Total cards: {total_cards}")
146
+ # Kick off processing in background to keep UI responsive
147
+ background_tasks.add_task(_process)
148
+ return {"job_id": job_id, "status": "processing"}
149
+
150
+
151
+ @app.get("/cards")
152
+ def list_cards(user_id: str, filename: Optional[str] = None, limit: int = 50, skip: int = 0):
153
+ return rag.list_cards(user_id=user_id, filename=filename, limit=limit, skip=skip)
154
+
155
+
156
+ @app.get("/file-summary")
157
+ def get_file_summary(user_id: str, filename: str):
158
+ doc = rag.get_file_summary(user_id=user_id, filename=filename)
159
+ if not doc:
160
+ raise HTTPException(404, detail="No summary found for that file.")
161
+ return {"filename": filename, "summary": doc.get("summary", "")}
162
+
163
+
164
+ @app.post("/chat")
165
+ async def chat(user_id: str = Form(...), question: str = Form(...), k: int = Form(6)):
166
+ """
167
+ RAG chat that answers ONLY from uploaded materials.
168
+ - Preload all filenames + summaries; use NVIDIA to classify file relevance to question (true/false)
169
+ - Restrict vector search to relevant files (fall back to all if none)
170
+ - Bring in recent chat memory: last 3 via NVIDIA relevance; remaining 17 via semantic search
171
+ - After answering, summarize (q,a) via NVIDIA and store into LRU (last 20)
172
+ """
173
+ from memo.memory import MemoryLRU
174
+ from memo.history import summarize_qa_with_nvidia, files_relevance, related_recent_and_semantic_context
175
+ from utils.router import NVIDIA_SMALL # reuse default name
176
+ memory = app.state.__dict__.setdefault("memory_lru", MemoryLRU())
177
+
178
+ # 0) If question is about a specific file, return the file summary
179
+ m = re.search(r"what\s+is\s+the\s+(.+?\.(pdf|docx))\s+about\??", question, re.IGNORECASE)
180
+ # If the question is about a specific file, return the file summary
181
+ if m:
182
+ fn = m.group(1)
183
+ doc = rag.get_file_summary(user_id=user_id, filename=fn)
184
+ if doc:
185
+ return {"answer": doc.get("summary", ""), "sources": [{"filename": fn, "file_summary": True}]}
186
+ else:
187
+ return {"answer": "I couldn't find a summary for that file in your library.", "sources": []}
188
+
189
+ # 1) Preload file list + summaries
190
+ files_list = rag.list_files(user_id=user_id) # [{filename, summary}]
191
+ # Ask NVIDIA to mark relevance per file
192
+ relevant_map = await files_relevance(question, files_list, nvidia_rotator)
193
+ relevant_files = [fn for fn, ok in relevant_map.items() if ok]
194
+
195
+ # 2) Memory context: recent 3 via NVIDIA, remaining 17 via semantic
196
+ # recent 3 related (we do a simple include-all; NVIDIA will prune by "related" selection using the same mechanism as files_relevance but here handled in history)
197
+ recent_related, semantic_related = await related_recent_and_semantic_context(user_id, question, memory, embedder)
198
+ # For recent_related (empty placeholder), do NVIDIA pruning now:
199
+ recent3 = memory.recent(user_id, 3)
200
+ if recent3:
201
+ sys = "Pick only items that directly relate to the new question. Output the selected items verbatim, no commentary. If none, output nothing."
202
+ numbered = [{"id": i+1, "text": s} for i, s in enumerate(recent3)]
203
+ user = f"Question: {question}\nCandidates:\n{json.dumps(numbered, ensure_ascii=False)}\nSelect any related items and output ONLY their 'text' values concatenated."
204
+ try:
205
+ from utils.rotator import robust_post_json
206
+ key = nvidia_rotator.get_key()
207
+ url = "https://integrate.api.nvidia.com/v1/chat/completions"
208
+ payload = {
209
+ "model": os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct"),
210
+ "temperature": 0.0,
211
+ "messages": [
212
+ {"role": "system", "content": sys},
213
+ {"role": "user", "content": user},
214
+ ]
215
+ }
216
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key or ''}"}
217
+ data = await robust_post_json(url, headers, payload, nvidia_rotator)
218
+ recent_related = data["choices"][0]["message"]["content"].strip()
219
+ except Exception as e:
220
+ logger.warning(f"Recent-related NVIDIA error: {e}")
221
+ recent_related = ""
222
+
223
+ # 3) RAG vector search (restricted to relevant files if any)
224
+ q_vec = embedder.embed([question])[0]
225
+ hits = rag.vector_search(user_id=user_id, query_vector=q_vec, k=k, filenames=relevant_files if relevant_files else None)
226
+ if not hits:
227
+ return {
228
+ "answer": "I don't know based on your uploaded materials. Try uploading more sources or rephrasing the question.",
229
+ "sources": [],
230
+ "relevant_files": relevant_files
231
+ }
232
+ # Compose context
233
+ contexts = []
234
+ sources_meta = []
235
+ for h in hits:
236
+ doc = h["doc"]
237
+ score = h["score"]
238
+ contexts.append(f"[{doc.get('topic_name','Topic')}] {trim_text(doc.get('content',''), 1200)}")
239
+ sources_meta.append({
240
+ "filename": doc.get("filename"),
241
+ "topic_name": doc.get("topic_name"),
242
+ "page_span": doc.get("page_span"),
243
+ "score": float(score),
244
+ "chunk_id": str(doc.get("_id", ""))
245
+ })
246
+ context_text = "\n\n---\n\n".join(contexts)
247
+
248
+ # Add file-level summaries for relevant files
249
+ file_summary_block = ""
250
+ if relevant_files:
251
+ fsum_map = {f["filename"]: f.get("summary","") for f in files_list}
252
+ lines = [f"[{fn}] {fsum_map.get(fn, '')}" for fn in relevant_files]
253
+ file_summary_block = "\n".join(lines)
254
+
255
+ # Guardrail instruction to avoid hallucination
256
+ system_prompt = (
257
+ "You are a careful study assistant. Answer strictly using the given CONTEXT.\n"
258
+ "If the answer isn't in the context, say 'I don't know based on the provided materials.'\n"
259
+ "Write concise, clear explanations with citations like (source: filename, topic).\n"
260
+ )
261
+
262
+ # Add recent chat context and historical similarity context
263
+ history_block = ""
264
+ if recent_related or semantic_related:
265
+ history_block = "RECENT_CHAT_CONTEXT:\n" + (recent_related or "") + ("\n\nHISTORICAL_SIMILARITY_CONTEXT:\n" + semantic_related if semantic_related else "")
266
+ composed_context = ""
267
+ if history_block:
268
+ composed_context += history_block + "\n\n"
269
+ if file_summary_block:
270
+ composed_context += "FILE_SUMMARIES:\n" + file_summary_block + "\n\n"
271
+ composed_context += "DOC_CONTEXT:\n" + context_text
272
+
273
+ # Compose user prompt
274
+ user_prompt = f"QUESTION:\n{question}\n\nCONTEXT:\n{composed_context}"
275
+ # Choose model (cost-aware)
276
+ selection = select_model(question=question, context=composed_context)
277
+ logger.info(f"Model selection: {selection}")
278
+ # Generate answer with model
279
+ try:
280
+ answer = await generate_answer_with_model(
281
+ selection=selection,
282
+ system_prompt=system_prompt,
283
+ user_prompt=user_prompt,
284
+ gemini_rotator=gemini_rotator,
285
+ nvidia_rotator=nvidia_rotator
286
+ )
287
+ except Exception as e:
288
+ logger.error(f"LLM error: {e}")
289
+ answer = "I had trouble contacting the language model provider just now. Please try again."
290
+ # After answering: summarize QA and store in memory (LRU, last 20)
291
+ try:
292
+ qa_sum = await summarize_qa_with_nvidia(question, answer, nvidia_rotator)
293
+ memory.add(user_id, qa_sum)
294
+ except Exception as e:
295
+ logger.warning(f"QA summarize/store failed: {e}")
296
+ # Trim for logging
297
+ logger.info("LLM answer (trimmed): %s", trim_text(answer, 200).replace("\n", " "))
298
+ return {"answer": answer, "sources": sources_meta}
299
+
300
+
301
+ @app.get("/healthz")
302
+ def health():
303
+ return {"ok": True}
dw_model.py ADDED
@@ -0,0 +1,49 @@
1
+ # dw_model.py
2
+ ### --- A. transformer and embedder ---
3
+ import os
4
+ import shutil
5
+ from huggingface_hub import snapshot_download
6
+
7
+ # Set up paths
8
+ MODEL_REPO = "sentence-transformers/all-MiniLM-L6-v2"
9
+ MODEL_CACHE_DIR = "/app/model_cache"
10
+ HF_CACHE_DIR = os.getenv("HF_HOME", "/home/user/.cache/huggingface")
11
+
12
+ print("⏳ Downloading the SentenceTransformer model...")
13
+ # Download directly into /app/model_cache to avoid duplicating files from HF cache
14
+ model_path = snapshot_download(
15
+ repo_id=MODEL_REPO,
16
+ cache_dir=HF_CACHE_DIR, # Store HF cache in user cache dir
17
+ local_dir=MODEL_CACHE_DIR, # Place usable model here
18
+ local_dir_use_symlinks=False # Copy files into local_dir (no symlinks)
19
+ )
20
+
21
+ print("Model path: ", model_path)
22
+ if not os.path.exists(MODEL_CACHE_DIR):
23
+ os.makedirs(MODEL_CACHE_DIR)
24
+
25
+ # Verify structure after moving
26
+ print("\n📂 LLM Model Structure (Build Level):")
27
+ for root, dirs, files in os.walk(MODEL_CACHE_DIR):
28
+ print(f"📁 {root}/")
29
+ for file in files:
30
+ print(f" 📄 {file}")
31
+
32
+
33
+ ### --- B. translation modules ---
34
+ # Optional pre-download of translation models. These can be very large and
35
+ # may exceed build storage limits on constrained environments (e.g., HF Spaces).
36
+ # Control with env var PRELOAD_TRANSLATORS ("1" to enable; default: disabled).
37
+ PRELOAD_TRANSLATORS = os.getenv("PRELOAD_TRANSLATORS", "0")
38
+ if PRELOAD_TRANSLATORS == "1":
39
+ try:
40
+ from transformers import pipeline
41
+ print("⏬ Pre-downloading Vietnamese–English translator...")
42
+ _ = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1)
43
+ print("⏬ Pre-downloading Chinese–English translator...")
44
+ _ = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1)
45
+ print("✅ Translators preloaded.")
46
+ except Exception as e:
47
+ print(f"⚠️ Skipping translator preload due to error: {e}")
48
+ else:
49
+ print("ℹ️ Skipping translator pre-download (PRELOAD_TRANSLATORS != '1'). They will lazy-load at runtime.")
memo/history.py ADDED
@@ -0,0 +1,134 @@
1
+ # ────────────────────────────── memo/history.py ──────────────────────────────
2
+ import os
3
+ import json
4
+ import logging
5
+ from typing import List, Dict, Any, Tuple
6
+ import numpy as np
7
+
8
+ from utils.logger import get_logger
9
+ from utils.rotator import robust_post_json
10
+ from utils.embeddings import EmbeddingClient
11
+
12
+ logger = get_logger("RAG", __name__)
13
+
14
+ NVIDIA_SMALL = os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")
15
+
16
+ async def _nvidia_chat(system_prompt: str, user_prompt: str, nvidia_key: str, rotator) -> str:
17
+ """
18
+ Minimal NVIDIA Chat call that enforces no-comment concise outputs.
19
+ """
20
+ url = "https://integrate.api.nvidia.com/v1/chat/completions"
21
+ payload = {
22
+ "model": NVIDIA_SMALL,
23
+ "temperature": 0.0,
24
+ "messages": [
25
+ {"role": "system", "content": system_prompt},
26
+ {"role": "user", "content": user_prompt},
27
+ ]
28
+ }
29
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {nvidia_key or ''}"}
30
+ data = None
31
+ try:
32
+ data = await robust_post_json(url, headers, payload, rotator)
33
+ return data["choices"][0]["message"]["content"]
34
+ except Exception as e:
35
+ logger.warning(f"NVIDIA chat error: {e} • response: {data}")
36
+ return ""
37
+
38
+ def _safe_json(s: str) -> Any:
39
+ try:
40
+ return json.loads(s)
41
+ except Exception:
42
+ # Try to extract a JSON object from text
43
+ start = s.find("{")
44
+ end = s.rfind("}")
45
+ if start != -1 and end != -1 and end > start:
46
+ try:
47
+ return json.loads(s[start:end+1])
48
+ except Exception:
49
+ return {}
50
+ return {}
51
+
52
+ async def summarize_qa_with_nvidia(question: str, answer: str, rotator) -> str:
53
+ """
54
+ Returns a single line block:
55
+ q: <concise>\na: <concise>
56
+ No extra commentary.
57
+ """
58
+ sys = "You are a terse summarizer. Output exactly two lines:\nq: <short question summary>\na: <short answer summary>\nNo extra text."
59
+ user = f"Question:\n{question}\n\nAnswer:\n{answer}"
60
+ key = rotator.get_key()
61
+ out = await _nvidia_chat(sys, user, key, rotator)
62
+ # Basic guard if the model returns extra prose
63
+ lines = [ln.strip() for ln in out.splitlines() if ln.strip()]
64
+ ql = next((l for l in lines if l.lower().startswith('q:')), None)
65
+ al = next((l for l in lines if l.lower().startswith('a:')), None)
66
+ if not ql or not al:
67
+ # Fallback truncate
68
+ ql = "q: " + (question.strip()[:160] + ("…" if len(question.strip()) > 160 else ""))
69
+ al = "a: " + (answer.strip()[:220] + ("…" if len(answer.strip()) > 220 else ""))
70
+ return f"{ql}\n{al}"
71
+
72
+ async def files_relevance(question: str, file_summaries: List[Dict[str, str]], rotator) -> Dict[str, bool]:
73
+ """
74
+ Ask NVIDIA model to mark each file as relevant (true) or not (false) for the question.
75
+ Returns {filename: bool}
76
+ """
77
+ sys = "You classify file relevance. Return STRICT JSON only with shape {\"relevance\":[{\"filename\":\"...\",\"relevant\":true|false}]}."
78
+ items = [{"filename": f["filename"], "summary": f.get("summary","")} for f in file_summaries]
79
+ user = f"Question: {question}\n\nFiles:\n{json.dumps(items, ensure_ascii=False)}\n\nReturn JSON only."
80
+ key = rotator.get_key()
81
+ out = await _nvidia_chat(sys, user, key, rotator)
82
+ data = _safe_json(out) or {}
83
+ rels = {}
84
+ for row in data.get("relevance", []):
85
+ fn = row.get("filename")
86
+ rv = row.get("relevant")
87
+ if isinstance(fn, str) and isinstance(rv, bool):
88
+ rels[fn] = rv
89
+ # If parsing failed, default to considering all files possibly relevant
90
+ if not rels and file_summaries:
91
+ rels = {f["filename"]: True for f in file_summaries}
92
+ return rels
93
+
94
+ def _cosine(a: np.ndarray, b: np.ndarray) -> float:
95
+ denom = (np.linalg.norm(a) * np.linalg.norm(b)) or 1.0
96
+ return float(np.dot(a, b) / denom)
97
+
98
+ def _as_text(block: str) -> str:
99
+ return block.strip()
100
+
101
+ async def related_recent_and_semantic_context(user_id: str, question: str, memory, embedder: EmbeddingClient, topk_sem: int = 3) -> Tuple[str, str]:
102
+ """
103
+ Returns (recent_related_text, semantic_related_text).
104
+ - recent_related_text: NVIDIA checks the last 3 summaries for direct relatedness.
105
+ - semantic_related_text: cosine-sim search over the remaining 17 summaries (top-k).
106
+ """
107
+ recent3 = memory.recent(user_id, 3)
108
+ rest17 = memory.rest(user_id, 3)
109
+
110
+ recent_text = ""
111
+ if recent3:
112
+ sys = "Pick only items that directly relate to the new question. Output the selected items verbatim, no commentary. If none, output nothing."
113
+ numbered = [{"id": i+1, "text": s} for i, s in enumerate(recent3)]
114
+ user = f"Question: {question}\nCandidates:\n{json.dumps(numbered, ensure_ascii=False)}\nSelect any related items and output ONLY their 'text' lines concatenated."
115
+ key = None # We'll let robust_post_json handle rotation via rotator param
116
+ # Use the same nvidia rotator mechanism via a fake call; we'll reconstruct in app with the real rotator passed through
117
+ # Here, we expect the caller to monkey-patch the chat with rotator; to keep it simple, we'll do a tiny trick:
118
+ # The real API call occurs in app with rotator. For here, we return empty and let app request do it. (But to keep module self-contained, we do call with rotator when provided.)
119
+ # However, since this function is called from app and gets the rotator, we'll move NVIDIA call out of here to avoid circular deps.
120
+
121
+ # We'll implement a pure semantic search for rest17 here; recent related will be handled in app using the same prompt.
122
+
123
+ # Semantic over rest17
124
+ sem_text = ""
125
+ if rest17:
126
+ qv = np.array(embedder.embed([question])[0], dtype="float32")
127
+ mats = embedder.embed([_as_text(s) for s in rest17])
128
+ sims = [(_cosine(qv, np.array(v, dtype="float32")), s) for v, s in zip(mats, rest17)]
129
+ sims.sort(key=lambda x: x[0], reverse=True)
130
+ top = [s for (sc, s) in sims[:topk_sem] if sc > 0.15] # small threshold
131
+ if top:
132
+ sem_text = "\n\n".join(top)
133
+ # Return recent empty (to be filled by caller using NVIDIA), and semantic text
134
+ return ("", sem_text)
memo/memory.py ADDED
@@ -0,0 +1,32 @@
1
+ # ────────────────────────────── memo/memory.py ──────────────────────────────
2
+ from collections import deque, defaultdict
3
+ from typing import List, Dict
4
+
5
+ class MemoryLRU:
6
+ """
7
+ Per-user LRU-like memory of the last N (default 20) summarized chat sessions.
8
+ Each item is a single string in the format: "q: ...\na: ..."
9
+ """
10
+ def __init__(self, capacity: int = 20):
11
+ self.capacity = capacity
12
+ self._store: Dict[str, deque] = defaultdict(lambda: deque(maxlen=self.capacity))
13
+
14
+ def add(self, user_id: str, qa_summary: str):
15
+ self._store[user_id].append(qa_summary)
16
+
17
+ def recent(self, user_id: str, n: int = 3) -> List[str]:
18
+ d = self._store[user_id]
19
+ if not d:
20
+ return []
21
+ # Return last n in recency order (most recent first)
22
+ return list(d)[-n:][::-1]
23
+
24
+ def rest(self, user_id: str, skip_n: int = 3) -> List[str]:
25
+ d = self._store[user_id]
26
+ if not d:
27
+ return []
28
+ # Everything except the most recent `skip_n`, oldest first
29
+ return list(d)[:-skip_n] if len(d) > skip_n else []
30
+
31
+ def all(self, user_id: str) -> List[str]:
32
+ return list(self._store[user_id])
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ fastapi #==0.114.2
+ uvicorn[standard] #==0.30.6
+ python-multipart #==0.0.9
+ pymongo #==4.8.0
+ httpx #==0.27.2
+ python-docx #==1.1.2
+ PyMuPDF #==1.24.10
+ pillow #==10.4.0
+ transformers #==4.44.2
+ torch #==2.4.0
+ sentence-transformers #==3.1.1
+ sumy #==0.11.0
+ numpy #==1.26.4
static/index.html ADDED
@@ -0,0 +1,47 @@
1
+ <!-- ────────────────────────────── static/index.html ────────────────────────────── -->
2
+ <!doctype html>
3
+ <html lang="en">
4
+ <head>
5
+ <meta charset="utf-8">
6
+ <title>StudyBuddy</title>
7
+ <meta name="viewport" content="width=device-width, initial-scale=1">
8
+ <link rel="stylesheet" href="/static/styles.css">
9
+ </head>
10
+ <body>
11
+ <div class="container">
12
+ <header>
13
+ <h1>📚 StudyBuddy</h1>
14
+ <p>Upload your PDFs/DOCX, then chat with your materials. No hallucinations — answers only come from your files.</p>
15
+ </header>
16
+
17
+ <section class="card">
18
+ <h2>1/ Upload materials</h2>
19
+ <form id="upload-form">
20
+ <label>User ID</label>
21
+ <input type="text" id="user_id" placeholder="e.g., user_123" required>
22
+ <label>Files (PDF/DOCX, multiple)</label>
23
+ <input type="file" id="files" multiple accept=".pdf,.docx">
24
+ <button type="submit">Upload</button>
25
+ </form>
26
+ <pre id="upload-log"></pre>
27
+ </section>
28
+
29
+ <section class="card">
30
+ <h2>2/ Ask questions</h2>
31
+ <div id="chat">
32
+ <div id="messages"></div>
33
+ <div class="chat-controls">
34
+ <input type="text" id="question" placeholder="Ask something about your files…">
35
+ <button id="ask">Ask</button>
36
+ </div>
37
+ </div>
38
+ </section>
39
+
40
+ <footer>
41
+ <small>StudyBuddy RAG • FastAPI on Hugging Face Spaces • MongoDB Vector • BLIP captions</small>
42
+ </footer>
43
+ </div>
44
+
45
+ <script src="/static/script.js"></script>
46
+ </body>
47
+ </html>
static/script.js ADDED
@@ -0,0 +1,72 @@
1
+ // ────────────────────────────── static/script.js ──────────────────────────────
2
+ const log = (msg) => {
3
+ const el = document.getElementById("upload-log");
4
+ el.textContent += msg + "\n";
5
+ el.scrollTop = el.scrollHeight;
6
+ };
7
+
8
+ // Upload
9
+ document.getElementById("upload-form").addEventListener("submit", async (e) => {
10
+ e.preventDefault();
11
+ const user_id = document.getElementById("user_id").value.trim();
12
+ const files = document.getElementById("files").files;
13
+ if (!user_id || files.length === 0) {
14
+ alert("Provide user id and at least one file.");
15
+ return;
16
+ }
17
+ const fd = new FormData();
18
+ fd.append("user_id", user_id);
19
+ for (let f of files) fd.append("files", f);
20
+
21
+ log("Uploading " + files.length + " file(s)…");
22
+ const res = await fetch("/upload", { method: "POST", body: fd });
23
+ const data = await res.json();
24
+ log("Upload accepted. Job: " + (data.job_id || "?") + " • status: " + (data.status || "?"));
25
+ log("Processing in the background. You can start chatting meanwhile.");
26
+ });
27
+
28
+ // Chat
29
+ document.getElementById("ask").addEventListener("click", async () => {
30
+ const user_id = document.getElementById("user_id").value.trim();
31
+ const q = document.getElementById("question").value.trim();
32
+ if (!user_id || !q) return;
33
+ appendMessage("user", q);
34
+ document.getElementById("question").value = "";
35
+
36
+ const fd = new FormData();
37
+ fd.append("user_id", user_id);
38
+ fd.append("question", q);
39
+ fd.append("k", "6");
40
+
41
+ try {
42
+ const res = await fetch("/chat", { method: "POST", body: fd });
43
+ const data = await res.json();
44
+ appendMessage("assistant", data.answer || "[no answer]");
45
+ if (data.sources && data.sources.length) {
46
+ appendSources(data.sources);
47
+ }
48
+ } catch (e) {
49
+ appendMessage("assistant", "⚠️ Error contacting server.");
50
+ }
51
+ });
52
+
53
+ function appendMessage(role, text) {
54
+ const m = document.createElement("div");
55
+ m.className = "msg " + role;
56
+ m.textContent = text;
57
+ document.getElementById("messages").appendChild(m);
58
+ m.scrollIntoView({ behavior: "smooth", block: "end" });
59
+ }
60
+
61
+ function appendSources(sources) {
62
+ const wrap = document.createElement("div");
63
+ wrap.className = "sources";
64
+ wrap.innerHTML = "<strong>Sources:</strong> " + sources.map(s => {
65
+ const f = s.filename || "unknown";
66
+ const t = s.topic_name ? (" • " + s.topic_name) : "";
67
+ const p = s.page_span ? (" [pp. " + s.page_span.join("-") + "]") : "";
68
+ return `<span class="pill">${f}${t}${p}</span>`;
69
+ }).join(" ");
70
+ document.getElementById("messages").appendChild(wrap);
71
+ wrap.scrollIntoView({ behavior: "smooth", block: "end" });
72
+ }
static/styles.css ADDED
@@ -0,0 +1,66 @@
1
+ /* ────────────────────────────── static/styles.css ────────────────────────────── */
2
+ :root {
3
+ --bg: #0b1020;
4
+ --card: #12193a;
5
+ --text: #e6ecff;
6
+ --muted: #9bb0ff;
7
+ --accent: #7aa2ff;
8
+ --pill: #1f2a5c;
9
+ --green: #41d6a5;
10
+ }
11
+
12
+ * { box-sizing: border-box; }
13
+
14
+ body {
15
+ margin: 0;
16
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, sans-serif;
17
+ color: var(--text);
18
+ background: radial-gradient(1200px 600px at 20% -10%, #18225a, var(--bg));
19
+ }
20
+
21
+ .container {
22
+ max-width: 960px;
23
+ margin: 0 auto;
24
+ padding: 24px;
25
+ }
26
+
27
+ header h1 { margin: 0 0 8px; }
28
+ header p { color: var(--muted); margin: 0 0 16px; }
29
+
30
+ .card {
31
+ background: var(--card);
32
+ border: 1px solid #1f2750;
33
+ border-radius: 16px;
34
+ padding: 16px;
35
+ margin: 16px 0;
36
+ box-shadow: 0 10px 30px rgba(0,0,0,0.25);
37
+ }
38
+
39
+ label { display: block; margin: 8px 0 6px; color: var(--muted); }
40
+ input[type="text"], input[type="file"] {
41
+ width: 100%; padding: 10px 12px; border-radius: 12px; border: 1px solid #2a3570;
42
+ background: #0f1430; color: var(--text);
43
+ }
44
+ button {
45
+ margin-top: 12px;
46
+ background: linear-gradient(135deg, var(--accent), #5bc7ff);
47
+ color: #0a0f25; border: none; border-radius: 12px; padding: 10px 16px; font-weight: 600;
48
+ cursor: pointer;
49
+ }
50
+ button:hover { filter: brightness(1.07); }
51
+
52
+ #upload-log {
53
+ height: 120px; overflow: auto; background: #0f1430; padding: 10px; border-radius: 12px; border: 1px solid #2a3570;
54
+ color: #b9c7ff;
55
+ }
56
+
57
+ #chat { display: flex; flex-direction: column; gap: 12px; }
58
+ #messages {
59
+ height: 300px; overflow: auto; background: #0f1430; padding: 12px; border-radius: 12px; border: 1px solid #2a3570;
60
+ }
61
+ .msg { padding: 10px 12px; border-radius: 12px; margin: 6px 0; max-width: 80%; white-space: pre-wrap; }
62
+ .msg.user { margin-left: auto; background: #173361; }
63
+ .msg.assistant { background: #0f244d; border: 1px solid #243a7a; }
64
+ .sources { margin: 8px 0; }
65
+ .pill { display: inline-block; background: var(--pill); padding: 4px 8px; border-radius: 999px; margin: 2px; color: #cbd6ff; border: 1px solid #304088; }
66
+ footer { text-align: center; color: var(--muted); margin-top: 24px; }
utils/caption.py ADDED
@@ -0,0 +1,41 @@
1
+ # ────────────────────────────── utils/caption.py ──────────────────────────────
2
+ from typing import Optional
3
+ from PIL import Image
4
+ import logging
5
+ from .logger import get_logger
6
+
7
+ # Use transformers BLIP base (CPU friendly)
8
+ try:
9
+ from transformers import BlipProcessor, BlipForConditionalGeneration
10
+ except Exception as e:
11
+ BlipProcessor = None
12
+ BlipForConditionalGeneration = None
13
+
14
+ logger = get_logger("CAPTION", __name__)
15
+
16
+
17
+ class BlipCaptioner:
18
+ def __init__(self):
19
+ self._ready = False
20
+ self.processor = None
21
+ self.model = None
22
+
23
+ def _lazy_load(self):
24
+ if self._ready:
25
+ return
26
+ if BlipProcessor is None or BlipForConditionalGeneration is None:
27
+ logger.warning("transformers not available; image captions will be skipped.")
28
+ self._ready = True
29
+ return
30
+ logger.info("Loading BLIP captioner (base)…")
31
+ self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
32
+ self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
33
+ self._ready = True
34
+
35
+ def caption_image(self, image: Image.Image) -> str:
36
+ self._lazy_load()
37
+ if self.processor is None or self.model is None:
38
+ return ""
39
+ inputs = self.processor(images=image, return_tensors="pt")
40
+ out = self.model.generate(**inputs, max_new_tokens=40)
41
+ return self.processor.decode(out[0], skip_special_tokens=True).strip()
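Illustrative usage sketch (not part of the commit): caption a single image with the class above. It assumes the utils package is importable from the repo root and that sample.jpg is a hypothetical local file.

    from PIL import Image
    from utils.caption import BlipCaptioner

    captioner = BlipCaptioner()          # BLIP weights load lazily on the first call
    img = Image.open("sample.jpg")       # hypothetical image path
    print(captioner.caption_image(img))  # short caption, or "" if transformers is missing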
utils/chunker.py ADDED
@@ -0,0 +1,85 @@
+ # ────────────────────────────── utils/chunker.py ──────────────────────────────
+ import re
+ from typing import List, Dict, Any
+ from .summarizer import cheap_summarize
+ from .common import split_sentences, slugify
+ from .logger import get_logger
+
+ # Heuristic "semantic" chunker:
+ # - Split by headings / numbered sections if present
+ # - Ensure each chunk is ~150-500 words (configurable via MIN_WORDS/MAX_WORDS)
+ # - Generate a short summary + topic name
+
+ MAX_WORDS = 500
+ MIN_WORDS = 150
+ logger = get_logger("CHUNKER", __name__)
+
+ def _by_headings(text: str):
+ # split on markdown-like or outline headings
+ pattern = r"(?m)^(#{1,6}\s.*|[0-9]+\.\s+[^\n]+|[A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$"
+ parts = []
+ last = 0
+ for m in re.finditer(pattern, text):
+ start = m.start()
+ if start > last:
+ parts.append(text[last:start])
+ parts.append(text[start:m.end()])
+ last = m.end()
+ if last < len(text):
+ parts.append(text[last:])
+ if not parts:
+ parts = [text]
+ return parts
+
+
+ def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str) -> List[Dict[str, Any]]:
+ # Concatenate pages but keep page spans for metadata
+ full = ""
+ page_markers = []
+ for p in pages:
+ start = len(full)
+ full += f"\n\n[[Page {p['page_num']}]]\n{p.get('text','').strip()}\n"
+ page_markers.append((p['page_num'], start, len(full)))
+
+ # First split by headings
+ coarse = _by_headings(full)
+
+ # Then pack into 150-500 word chunks
+ cards = []
+ buf = []
+ buf_words = 0
+ start_idx = 0
+ for block in coarse:
+ words = block.split()
+ if not words:
+ continue
+ if buf_words + len(words) > MAX_WORDS and buf_words >= MIN_WORDS:
+ cards.append(" ".join(buf))
+ buf, buf_words = [], 0
+ start_idx = len(" ".join(coarse[:coarse.index(block)])) # approximate
+ buf.extend(words)
+ buf_words += len(words)
+ if buf_words > 0:
+ cards.append(" ".join(buf))
+
+ # Build card dicts
+ out = []
+ for i, content in enumerate(cards, 1):
+ topic = cheap_summarize(content, max_sentences=1)
+ if not topic:
+ topic = content[:80] + "..."
+ summary = cheap_summarize(content, max_sentences=3)
+ # Estimate page span
+ first_page = pages[0]['page_num'] if pages else 1
+ last_page = pages[-1]['page_num'] if pages else 1
+ out.append({
+ "user_id": user_id,
+ "filename": filename,
+ "topic_name": topic[:120],
+ "summary": summary,
+ "content": content,
+ "page_span": [first_page, last_page],
+ "card_id": f"{slugify(filename)}-c{i:04d}"
+ })
+ logger.info(f"Built {len(out)} cards from {len(pages)} pages for {filename}")
+ return out
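Illustrative sketch (not part of the commit): feed one parsed page through the chunker and inspect the resulting card. The filename and user_id values are made up; the page dict shape matches what utils/parser.py returns.

    from utils.chunker import build_cards_from_pages

    pages = [{"page_num": 1, "text": "Photosynthesis converts light into chemical energy. ...", "images": []}]
    cards = build_cards_from_pages(pages, filename="biology.pdf", user_id="demo-user")
    print(cards[0]["card_id"], cards[0]["page_span"])  # e.g. "biologypdf-c0001" [1, 1]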
utils/common.py ADDED
@@ -0,0 +1,20 @@
+ import re
+ import unicodedata
+ from .logger import get_logger
+
+ logger = get_logger("COMMON", __name__)
+
+ def split_sentences(text: str):
+ return re.split(r"(?<=[\.\!\?])\s+", text.strip())
+
+ def slugify(value: str):
+ value = str(value)
+ value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
+ value = re.sub(r"[^\w\s-]", "", value).strip().lower()
+ return re.sub(r"[-\s]+", "-", value)
+
+ def trim_text(s: str, n: int):
+ s = s or ""
+ if len(s) <= n:
+ return s
+ return s[:n] + "…"
utils/embeddings.py ADDED
@@ -0,0 +1,34 @@
+ # ────────────────────────────── utils/embeddings.py ──────────────────────────────
+ import os
+ from typing import List
+ import numpy as np
+ import logging
+ from .logger import get_logger
+
+ try:
+ from sentence_transformers import SentenceTransformer
+ except Exception:
+ SentenceTransformer = None
+
+
+ logger = get_logger("EMBED", __name__)
+
+
+ class EmbeddingClient:
+ def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
+ self.model_name = model_name
+ self.model = None
+
+ def _lazy(self):
+ if self.model is None and SentenceTransformer is not None:
+ logger.info(f"Loading embedding model: {self.model_name}")
+ self.model = SentenceTransformer(self.model_name)
+
+ def embed(self, texts: List[str]) -> List[list]:
+ self._lazy()
+ if self.model is None:
+ # Fallback: extremely naive hashing -> NOT for production, but keeps code running without deps
+ logger.warning("SentenceTransformer unavailable; using random fallback embeddings.")
+ return [list(np.random.default_rng(hash(t) % (2**32)).normal(size=384).astype("float32")) for t in texts]
+ vecs = self.model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
+ return [v.tolist() for v in vecs]
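Illustrative sketch (not part of the commit): embed two strings and confirm the vector width matches the 384-dimension MiniLM output that utils/rag.py expects.

    from utils.embeddings import EmbeddingClient

    client = EmbeddingClient()
    vectors = client.embed(["mitochondria are the powerhouse of the cell", "Krebs cycle"])
    print(len(vectors), len(vectors[0]))  # expected: 2 384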
utils/logger.py ADDED
@@ -0,0 +1,38 @@
+ import logging
+ import sys
+ from typing import Optional
+
+
+ _DEFAULT_FORMAT = "%(asctime)s %(levelname)s %(message)s"
+
+
+ def _ensure_root_handler() -> None:
+ root_logger = logging.getLogger()
+ if root_logger.handlers:
+ return
+ handler = logging.StreamHandler(stream=sys.stdout)
+ formatter = logging.Formatter(_DEFAULT_FORMAT)
+ handler.setFormatter(formatter)
+ root_logger.addHandler(handler)
+ root_logger.setLevel(logging.INFO)
+
+
+ class _TaggedAdapter(logging.LoggerAdapter):
+ def process(self, msg, kwargs):
+ tag = self.extra.get("tag", "")
+ if tag and not str(msg).startswith(tag):
+ msg = f"{tag} {msg}"
+ return msg, kwargs
+
+
+ def get_logger(tag: str, name: Optional[str] = None) -> logging.LoggerAdapter:
+ """
+ Return a logger adapter that injects a [TAG] prefix into records.
+ Example: logger = get_logger("APP") → logs like: [APP] message
+ """
+ _ensure_root_handler()
+ logger_name = name or __name__
+ base = logging.getLogger(logger_name)
+ return _TaggedAdapter(base, {"tag": f"[{tag}]"})
+
+
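Illustrative sketch (not part of the commit): every module grabs a tagged logger and the prefix is injected automatically.

    from utils.logger import get_logger

    log = get_logger("DEMO", __name__)
    log.info("server started")  # emits something like: 2024-01-01 00:00:00,000 INFO [DEMO] server started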
utils/parser.py ADDED
@@ -0,0 +1,53 @@
+ import io
+ from typing import List, Dict, Any
+ import fitz # PyMuPDF
+ from docx import Document
+ from PIL import Image
+ import numpy as np
+ from .logger import get_logger
+
+ logger = get_logger("PARSER", __name__)
+
+
+ def parse_pdf_bytes(b: bytes) -> List[Dict[str, Any]]:
+ """
+ Returns list of pages, each {'page_num': i, 'text': str, 'images': [PIL.Image]}
+ """
+ pages = []
+ with fitz.open(stream=b, filetype="pdf") as doc:
+ for i, page in enumerate(doc):
+ text = page.get_text("text")
+ images = []
+ for img in page.get_images(full=True):
+ xref = img[0]
+ pix = fitz.Pixmap(doc, xref)
+ if pix.n - pix.alpha >= 4: # CMYK
+ pix = fitz.Pixmap(fitz.csRGB, pix)
+ im = Image.frombytes("RGBA" if pix.alpha else "RGB", (pix.width, pix.height), pix.samples)
+ images.append(im.convert("RGB"))
+ pix = None
+ pages.append({"page_num": i + 1, "text": text, "images": images})
+ logger.info(f"Parsed PDF with {len(pages)} pages")
+ return pages
+
+
+ def parse_docx_bytes(b: bytes) -> List[Dict[str, Any]]:
+ f = io.BytesIO(b)
+ doc = Document(f)
+ text = []
+ images = []
+ for rel in doc.part.rels.values():
+ if "image" in rel.reltype:
+ data = rel.target_part.blob
+ try:
+ im = Image.open(io.BytesIO(data)).convert("RGB")
+ images.append(im)
+ except Exception:
+ pass
+ for p in doc.paragraphs:
+ text.append(p.text)
+ pages = [{"page_num": 1, "text": "\n".join(text), "images": images}]
+ logger.info("Parsed DOCX into single concatenated page")
+ return pages
+
+
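Illustrative sketch (not part of the commit): parse a PDF from disk and count the extracted images. The file path is hypothetical; in the app the bytes come from the upload endpoint instead.

    from utils.parser import parse_pdf_bytes

    with open("lecture_notes.pdf", "rb") as fh:  # hypothetical path
        pages = parse_pdf_bytes(fh.read())
    print(sum(len(p["images"]) for p in pages), "images across", len(pages), "pages")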
utils/rag.py ADDED
@@ -0,0 +1,132 @@
+ # ────────────────────────────── utils/rag.py ──────────────────────────────
+ import os
+ import math
+ from typing import List, Dict, Any, Optional
+ from pymongo import MongoClient, ASCENDING, TEXT
+ from pymongo.collection import Collection
+ from pymongo.errors import PyMongoError
+ import numpy as np
+ from .logger import get_logger
+
+ VECTOR_DIM = 384 # all-MiniLM-L6-v2
+ INDEX_NAME = os.getenv("MONGO_VECTOR_INDEX", "vector_index")
+ USE_ATLAS_VECTOR = os.getenv("ATLAS_VECTOR", "0") == "1"
+ logger = get_logger("RAG", __name__)
+
+
+
+ class RAGStore:
+ def __init__(self, mongo_uri: str, db_name: str = "studybuddy"):
+ self.client = MongoClient(mongo_uri)
+ self.db = self.client[db_name]
+ self.chunks: Collection = self.db["chunks"]
+ self.files: Collection = self.db["files"]
+
+ # ── Write ────────────────────────────────────────────────────────────────
+ def store_cards(self, cards: List[Dict[str, Any]]):
+ if not cards:
+ return
+ for c in cards:
+ # basic validation
+ emb = c.get("embedding")
+ if not emb or len(emb) != VECTOR_DIM:
+ raise ValueError("Invalid embedding length; expected %d" % VECTOR_DIM)
+ self.chunks.insert_many(cards, ordered=False)
+ logger.info(f"Inserted {len(cards)} cards into MongoDB")
+
+ def upsert_file_summary(self, user_id: str, filename: str, summary: str):
+ self.files.update_one(
+ {"user_id": user_id, "filename": filename},
+ {"$set": {"summary": summary}},
+ upsert=True
+ )
+ logger.info(f"Upserted summary for {filename} (user {user_id})")
+
+ # ── Read ────────────────────────────────────────────────────────────────
+ def list_cards(self, user_id: str, filename: Optional[str], limit: int, skip: int):
+ q = {"user_id": user_id}
+ if filename:
+ q["filename"] = filename
+ cur = self.chunks.find(q, {"embedding": 0}).skip(skip).limit(limit).sort([("_id", ASCENDING)])
+ return list(cur)
+
+ def list_files(self, user_id: str) -> List[Dict[str, Any]]:
+ cur = self.files.find({"user_id": user_id}, {"_id": 0})
+ return list(cur)
+
+ def get_file_summary(self, user_id: str, filename: str):
+ return self.files.find_one({"user_id": user_id, "filename": filename})
+
+ def vector_search(self, user_id: str, query_vector: List[float], k: int = 6, filenames: Optional[List[str]] = None):
+ if USE_ATLAS_VECTOR:
+ # Atlas Vector Search (requires pre-created index on 'embedding')
+ pipeline = [
+ {
+ "$search": {
+ "index": INDEX_NAME,
+ "knnBeta": {
+ "vector": query_vector,
+ "path": "embedding",
+ "k": k,
+ # keep results scoped to the requesting user
+ "filter": {"equals": {"path": "user_id", "value": user_id}},
+ },
+ }
+ },
+ # $project cannot mix exclusions with computed fields, so project first, then strip embeddings
+ {"$project": {"score": {"$meta": "searchScore"}, "doc": "$$ROOT"}},
+ {"$project": {"doc.embedding": 0}},
+ ]
+ if filenames:
+ pipeline.append({"$match": {"doc.filename": {"$in": filenames}}})
+ pipeline.append({"$limit": k})
+ hits = list(self.chunks.aggregate(pipeline))
+ return [{"doc": h["doc"], "score": h["score"]} for h in hits]
+ # Fallback: scan limited sample and compute cosine locally
+ else:
+ q = {"user_id": user_id}
+ # Apply filename filter if provided
+ if filenames:
+ q["filename"] = {"$in": filenames}
+ # Scan limited sample and compute cosine locally
+ sample = list(self.chunks.find(q).limit(max(2000, k*10)))
+ # If no sample, return empty list
+ if not sample:
+ return []
+ # Prepare the query vector
+ qv = np.array(query_vector, dtype="float32")
+ scores = []
+ # Compute cosine similarity for each sample
+ for d in sample:
+ v = np.array(d.get("embedding", [0]*VECTOR_DIM), dtype="float32")
+ denom = (np.linalg.norm(qv) * np.linalg.norm(v)) or 1.0
+ sim = float(np.dot(qv, v) / denom)
+ scores.append((sim, d))
+ # Sort scores by cosine similarity in descending order
+ scores.sort(key=lambda x: x[0], reverse=True)
+ # Get top k scores
+ top = scores[:k]
+ # Log the results
+ logger.info(f"Vector search sample={len(sample)} returned top={len(top)}")
+ return [{"doc": d, "score": s} for (s, d) in top]
+
+
+ def ensure_indexes(store: RAGStore):
+ # Basic text index for fallback keyword search (optional)
+ try:
+ store.chunks.create_index([("user_id", ASCENDING), ("filename", ASCENDING)])
+ store.chunks.create_index([("content", TEXT), ("topic_name", TEXT), ("summary", TEXT)], name="text_idx")
+ store.files.create_index([("user_id", ASCENDING), ("filename", ASCENDING)], unique=True)
+ except PyMongoError as e:
+ logger.warning(f"Index creation warning: {e}")
+ # Note: For Atlas Vector, create an Atlas Search index named INDEX_NAME on field "embedding" with vector options.
+ # Example (in Atlas UI):
+ # {
+ # "mappings": {
+ # "dynamic": false,
+ # "fields": {
+ # "embedding": {
+ # "type": "knnVector",
+ # "dimensions": 384,
+ # "similarity": "cosine"
+ # }
+ # }
+ # }
+ # }
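Illustrative sketch (not part of the commit): wire the store to the embedding client and run the local cosine fallback search. The MONGODB_URI env var name, user id, and question text are assumptions for the example.

    import os
    from utils.rag import RAGStore, ensure_indexes
    from utils.embeddings import EmbeddingClient

    store = RAGStore(os.environ["MONGODB_URI"])           # assumed env var name
    ensure_indexes(store)
    qv = EmbeddingClient().embed(["What is the Krebs cycle?"])[0]
    for hit in store.vector_search(user_id="demo-user", query_vector=qv, k=3):
        print(round(hit["score"], 3), hit["doc"]["topic_name"])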
utils/rotator.py ADDED
@@ -0,0 +1,61 @@
+ # ────────────────────────────── utils/rotator.py ──────────────────────────────
+ import os
+ import itertools
+ import logging
+ from .logger import get_logger
+ from typing import Optional
+
+ import httpx
+
+ logger = get_logger("ROTATOR", __name__)
+
+
+ class APIKeyRotator:
+ """
+ Round-robin API key rotator.
+ - Loads keys from env vars with given prefix (e.g., GEMINI_API_1..5)
+ - get_key() returns current key
+ - rotate() moves to next key
+ - on HTTP 401/429/5xx you should call rotate() and retry (bounded)
+ """
+ def __init__(self, prefix: str, max_slots: int = 5):
+ self.keys = []
+ for i in range(1, max_slots + 1):
+ v = os.getenv(f"{prefix}{i}")
+ if v:
+ self.keys.append(v.strip())
+ if not self.keys:
+ logger.warning(f"No API keys found for prefix {prefix}. Calls will likely fail.")
+ self._cycle = itertools.cycle([""])
+ else:
+ self._cycle = itertools.cycle(self.keys)
+ self.current = next(self._cycle)
+
+ def get_key(self) -> Optional[str]:
+ return self.current
+
+ def rotate(self) -> Optional[str]:
+ self.current = next(self._cycle)
+ logger.info("Rotated API key.")
+ return self.current
+
+
+ async def robust_post_json(url: str, headers: dict, payload: dict, rotator: APIKeyRotator, max_retries: int = 5):
+ """
+ POST JSON with simple retry+rotate on 401/403/429/5xx.
+ Returns json response.
+ """
+ for attempt in range(max_retries):
+ try:
+ async with httpx.AsyncClient(timeout=60) as client:
+ r = await client.post(url, headers=headers, json=payload)
+ if r.status_code in (401, 403, 429) or (500 <= r.status_code < 600):
+ logger.warning(f"HTTP {r.status_code} from provider. Rotating key and retrying ({attempt+1}/{max_retries})")
+ rotator.rotate()
+ continue
+ r.raise_for_status()
+ return r.json()
+ except Exception as e:
+ logger.warning(f"Request error: {e}. Rotating and retrying ({attempt+1}/{max_retries})")
+ rotator.rotate()
+ raise RuntimeError("Provider request failed after retries.")
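Illustrative sketch (not part of the commit): build a rotator from the GEMINI_API_1..5 env vars named in the docstring and attach the current key to a request header. Note that url/headers passed to robust_post_json are fixed for all retries, so the rotated key only takes effect on the next call that rebuilds them.

    from utils.rotator import APIKeyRotator

    rotator = APIKeyRotator(prefix="GEMINI_API_", max_slots=5)
    headers = {"Authorization": f"Bearer {rotator.get_key()}"}
    # data = await robust_post_json(url, headers, payload, rotator)  # inside an async context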
utils/router.py ADDED
@@ -0,0 +1,83 @@
+ # ────────────────────────────── utils/router.py ──────────────────────────────
+ import os
+ import logging
+ from .logger import get_logger
+ from typing import Dict, Any
+ from .rotator import robust_post_json, APIKeyRotator
+
+ logger = get_logger("ROUTER", __name__)
+
+ # Default model names (can be overridden via env)
+ GEMINI_SMALL = os.getenv("GEMINI_SMALL", "gemini-2.5-flash-lite")
+ GEMINI_MED = os.getenv("GEMINI_MED", "gemini-2.5-flash")
+ GEMINI_PRO = os.getenv("GEMINI_PRO", "gemini-2.5-pro")
+
+ # NVIDIA small default (can be overridden)
+ NVIDIA_SMALL = os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct") # example; adjust to your NIM catalog
+
+ def select_model(question: str, context: str) -> Dict[str, Any]:
+ """
+ Very lightweight complexity heuristic:
+ - If long question or lots of context -> MED/PRO
+ - If code/math keywords -> PRO
+ - Else SMALL
+ Prefers NVIDIA small when question is short/simple (cost-awareness).
+ """
+ qlen = len(question.split())
+ clen = len(context.split())
+ hard_keywords = ("prove", "derivation", "complexity", "algorithm", "optimize", "theorem", "rigorous", "step-by-step", "policy critique", "ambiguity", "counterfactual")
+ is_hard = any(k in question.lower() for k in hard_keywords) or qlen > 60 or clen > 1600
+
+ if is_hard:
+ # Use Gemini Pro (larger context)
+ return {"provider": "gemini", "model": GEMINI_PRO}
+ elif qlen > 25 or clen > 900:
+ return {"provider": "gemini", "model": GEMINI_MED}
+ else:
+ # Prefer NVIDIA small for cheap/light
+ return {"provider": "nvidia", "model": NVIDIA_SMALL}
+
+
+ async def generate_answer_with_model(selection: Dict[str, Any], system_prompt: str, user_prompt: str,
+ gemini_rotator: APIKeyRotator, nvidia_rotator: APIKeyRotator) -> str:
+ provider = selection["provider"]
+ model = selection["model"]
+
+ if provider == "gemini":
+ key = gemini_rotator.get_key() or ""
+ url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={key}"
+ payload = {
+ "contents": [
+ {"role": "user", "parts": [{"text": f"{system_prompt}\n\n{user_prompt}"}]}
+ ],
+ "generationConfig": {"temperature": 0.2}
+ }
+ headers = {"Content-Type": "application/json"}
+ data = await robust_post_json(url, headers, payload, gemini_rotator)
+ try:
+ return data["candidates"][0]["content"]["parts"][0]["text"]
+ except Exception:
+ logger.warning(f"Unexpected Gemini response: {data}")
+ return "I couldn't parse the model response."
+
+ elif provider == "nvidia":
+ # Many NVIDIA endpoints are OpenAI-compatible. Adjust if using a different path.
+ key = nvidia_rotator.get_key() or ""
+ url = "https://integrate.api.nvidia.com/v1/chat/completions"
+ payload = {
+ "model": model,
+ "temperature": 0.2,
+ "messages": [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_prompt},
+ ]
+ }
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
+ data = await robust_post_json(url, headers, payload, nvidia_rotator)
+ try:
+ return data["choices"][0]["message"]["content"]
+ except Exception:
+ logger.warning(f"Unexpected NVIDIA response: {data}")
+ return "I couldn't parse the model response."
+
+ return "Unsupported provider."
utils/summarizer.py ADDED
@@ -0,0 +1,19 @@
+ from typing import List
+ from sumy.parsers.plaintext import PlaintextParser
+ from sumy.nlp.tokenizers import Tokenizer
+ from sumy.summarizers.lex_rank import LexRankSummarizer
+ from .logger import get_logger
+
+ logger = get_logger("SUM", __name__)
+
+ def cheap_summarize(text: str, max_sentences: int = 3) -> str:
+ try:
+ parser = PlaintextParser.from_string(text, Tokenizer("english"))
+ summarizer = LexRankSummarizer()
+ sentences = summarizer(parser.document, max_sentences)
+ return " ".join(str(s) for s in sentences)
+ except Exception:
+ # Fallback: naive first N sentences
+ logger.warning("sumy unavailable or failed; using naive summarization fallback.")
+ parts = text.split(". ")
+ return ". ".join(parts[:max_sentences])
warmup.py ADDED
@@ -0,0 +1,17 @@
+ from sentence_transformers import SentenceTransformer
+ import torch
+ import os
+
+ print("🚀 Warming up model...")
+ embedding_model = SentenceTransformer("/app/model_cache", device="cpu")
+
+ # Some CPU backends on HF Spaces fail on .half(); make it configurable
+ USE_HALF = os.getenv("EMBEDDING_HALF", "1") == "1"
+ try:
+ if USE_HALF and torch.cuda.is_available():
+ embedding_model = embedding_model.half()
+ except Exception as e:
+ print(f"⚠️ Skipping half precision due to: {e}")
+
+ embedding_model.to(torch.device("cpu"))
+ print("✅ Model warm-up complete!")