samyakshrestha committed on
Commit 0f7b282 · 1 Parent(s): bf951a0

First commit

Files changed (4)
  1. .DS_Store +0 -0
  2. Dockerfile +16 -0
  3. app.py +181 -0
  4. requirements.txt +12 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
Dockerfile ADDED
@@ -0,0 +1,16 @@
+ FROM python:3.10-slim
+
+ # System deps for git-lfs (model pulls) and faster tokenization wheels
+ RUN apt-get update && apt-get install -y git-lfs && git lfs install
+
+ WORKDIR /app
+
+ # Install Python deps first for cache efficiency
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ EXPOSE 7860
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,181 @@
+ # ------------------------------------------------------------------
+ # app.py
+ # FastAPI + Gradio hybrid RAG service
+ # (c) Samyak Shrestha — 2025
+ # ------------------------------------------------------------------
+
+ import os, json, time
+ from pathlib import Path
+ from typing import List
+
+ import torch
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ import gradio as gr
+
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     BitsAndBytesConfig,
+ )
+
+ from huggingface_hub import hf_hub_download
+ import faiss
+ from sentence_transformers import SentenceTransformer
+ import numpy as np
+
+ # ------------------------------------------------------------------
+ # Configuration
+ # ------------------------------------------------------------------
+ HF_MODEL_ID = "samyakshrestha/merged-finetuned-mistral"  # weights + FAISS live here
+ EMBED_MODEL = "BAAI/bge-base-en-v1.5"
+
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ TOP_K = 5
+ CTX_TOKEN_LIMIT = 2048
+ MAX_NEW_TOKENS = 256
+
+ DATA_DIR = Path("data")  # cached at runtime
+ DATA_DIR.mkdir(exist_ok=True)
+
+ FAISS_BIN_NAME = "data/faiss_index/faiss_index.bin"
+ META_JSON_NAME = "data/faiss_index/chunk_metadata.json"
+ INDEX_PATH = DATA_DIR / FAISS_BIN_NAME  # hf_hub_download(local_dir=...) keeps the repo-relative path
+ META_PATH = DATA_DIR / META_JSON_NAME
+
+ # ------------------------------------------------------------------
+ # 1) Embedding model
+ # ------------------------------------------------------------------
+ print("Loading embedding model …")
+ embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
+ embed_dim = embedder.get_sentence_embedding_dimension()
+ print(f"{EMBED_MODEL} ({embed_dim}-d vectors)")
+
+ # ------------------------------------------------------------------
+ # 2) Download / load FAISS index + metadata
+ # ------------------------------------------------------------------
+ def download_assets():
+     if not INDEX_PATH.exists():
+         print("Downloading FAISS index from Hub …")
+         hf_hub_download(
+             repo_id=HF_MODEL_ID,
+             filename=FAISS_BIN_NAME,
+             local_dir=DATA_DIR,
+             local_dir_use_symlinks=False,
+         )
+     if not META_PATH.exists():
+         print("Downloading metadata …")
+         hf_hub_download(
+             repo_id=HF_MODEL_ID,
+             filename=META_JSON_NAME,
+             local_dir=DATA_DIR,
+             local_dir_use_symlinks=False,
+         )
+
+ download_assets()
+
+ print("Loading FAISS index …")
+ index = faiss.read_index(str(INDEX_PATH))
+ with open(META_PATH) as f:
+     chunk_metadata = json.load(f)
+ assert index.ntotal == len(chunk_metadata), "Index / metadata size mismatch"
+ print(f"vectors = {index.ntotal}")
+
+ # ------------------------------------------------------------------
+ # 3) Load language model (4-bit if bitsandbytes is available)
+ # ------------------------------------------------------------------
+ print("Loading LoRA-fine-tuned Mistral …")
+
+ bnb_cfg = None
+ try:
+     import bitsandbytes  # noqa: F401
+     bnb_cfg = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_compute_dtype=torch.float16,
+         bnb_4bit_use_double_quant=True,
+         bnb_4bit_quant_type="nf4",
+     )
+     print("bitsandbytes detected → 4-bit quant enabled")
+ except ImportError:
+     print("bitsandbytes not found → loading in fp16 / fp32")
+
+ tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, use_fast=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     HF_MODEL_ID,
+     device_map="auto" if DEVICE == "cuda" else None,
+     torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+     quantization_config=bnb_cfg,
+ )
+ model.eval()
+ print("model ready")
+
+ # ------------------------------------------------------------------
+ # 4) Retrieval & Generation helpers
+ # ------------------------------------------------------------------
+ def retrieve_chunks(query: str, k: int = TOP_K) -> List[dict]:
+     emb = embedder.encode([query], normalize_embeddings=True)
+     _, idxs = index.search(emb, k)
+     return [chunk_metadata[int(i)] for i in idxs[0]]
+
+ def build_prompt(query: str, chunks: List[dict]) -> str:
+     ctx_blocks, total_tokens = [], 0
+     for ch in chunks:
+         block = f"[{ch['title']}]\n{ch['text']}\n"
+         toks = len(tokenizer.tokenize(block))
+         if total_tokens + toks <= CTX_TOKEN_LIMIT:
+             ctx_blocks.append(block)
+             total_tokens += toks
+     context = "\n\n".join(ctx_blocks)
+     return (
+         "You are an expert scientific assistant. "
+         "Use the excerpts to answer.\n\n"
+         f"Excerpts:\n{context}\n\n"
+         f"Question: {query}\nAnswer:"
+     )
+
+ @torch.inference_mode()
+ def generate_answer(query: str) -> str:
+     prompt = build_prompt(query, retrieve_chunks(query))
+     inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+     output = model.generate(
+         **inputs,
+         max_new_tokens=MAX_NEW_TOKENS,
+         do_sample=False,
+         top_p=1.0,
+     )
+     return (
+         tokenizer.decode(output[0], skip_special_tokens=True)
+         .split("Answer:")[-1]
+         .strip()
+     )
+
+ # ------------------------------------------------------------------
+ # 5) FastAPI backend
+ # ------------------------------------------------------------------
+ api = FastAPI(title="Finetuned Mistral RAG API")
+
+ class Question(BaseModel):
+     question: str
+
+ class Answer(BaseModel):
+     answer: str
+
+ @api.post("/rag", response_model=Answer)
+ def rag_endpoint(item: Question):
+     return Answer(answer=generate_answer(item.question))
+
+ # ------------------------------------------------------------------
+ # 6) Gradio chat UI
+ # ------------------------------------------------------------------
+ demo = gr.Interface(
+     fn=generate_answer,
+     inputs=gr.Textbox(label="Ask a question about LLM fine-tuning"),
+     outputs=gr.Textbox(label="Answer"),
+     title="Finetuned Mistral-7B — Retrieval-Augmented QA",
+ )
+
+ # ------------------------------------------------------------------
+ # 7) Launch (Spaces exposes port 7860)
+ # ------------------------------------------------------------------
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860)
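
Note that running `python app.py` starts only the Gradio server, so the FastAPI `/rag` route defined in section 5 is never bound to a port. A minimal sketch of how the two could share port 7860, assuming Gradio 4.x's `gr.mount_gradio_app` and the `uvicorn` pin already in requirements.txt; `api` and `demo` refer to the objects defined in app.py above:

# Sketch only: an alternative section 7 so FastAPI and the Gradio UI share one server.
# Assumes the `api` (FastAPI) and `demo` (gr.Interface) objects defined earlier in app.py.
import uvicorn
import gradio as gr

# Mount the Gradio UI onto the FastAPI app; "/rag" stays reachable alongside the UI.
app = gr.mount_gradio_app(api, demo, path="/")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)  # Spaces routes traffic to port 7860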
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ fastapi==0.110.1
+ uvicorn[standard]==0.29.0
+ transformers==4.40.1
+ huggingface_hub==0.23.0
+ sentence-transformers==2.7.0
+ faiss-cpu==1.7.4
+ torch==2.2.2
+ gradio==4.24.0
+ pydantic>=2.6
+ numpy
+ bitsandbytes ; sys_platform == 'linux'  # only installs on Linux/GPU
+ accelerate  # optional, speeds HF model I/O
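
For reference, a hedged client-side example of calling the `/rag` endpoint once the FastAPI app is actually being served (for instance via the uvicorn sketch above). The host, port, example question, and the `requests` dependency are assumptions, not part of this commit:

# Hypothetical client call; assumes the API is reachable at localhost:7860
# and that `requests` is installed in the client environment.
import requests

resp = requests.post(
    "http://localhost:7860/rag",
    json={"question": "What does LoRA fine-tuning change inside a transformer layer?"},
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["answer"])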