Spaces:

Qar-Raz
/

NLP-RAG

Sleeping

App Files Files Community

Muddasri commited on 15 days ago

Commit

1737e82

unverified ·

2 Parent(s): 44406e5 7b44ae2

Merge branch 'main' into Muddasir/BackendComplete

Browse files

Files changed (12) hide show

.gitignore +1 -0
api.py +295 -15
models/deepseek_v3.py +6 -3
models/llama_3_8b.py +5 -2
models/mistral_7b.py +5 -4
models/qwen_2_5.py +5 -2
models/tiny_aya.py +6 -4
requirements.txt +1 -0
retriever/generator.py +9 -2
retriever/processor.py +8 -3
retriever/retriever.py +11 -0
vector_db.py +81 -0

.gitignore CHANGED Viewed

@@ -25,6 +25,7 @@ dist/
 .mypy_cache/
 .ruff_cache/
 .ipynb_checkpoints/
 # IDE/editor
 .vscode/

 .mypy_cache/
 .ruff_cache/
 .ipynb_checkpoints/
+.cache/
 # IDE/editor
 .vscode/

api.py CHANGED Viewed

@@ -1,14 +1,18 @@
 # Fastapi endpoints defined here
 import os
 import time
 from typing import Any
 from dotenv import load_dotenv
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
-from vector_db import get_index_by_name, load_chunks_from_pinecone
 from retriever.retriever import HybridRetriever
 from retriever.generator import RAGGenerator
 from retriever.processor import ChunkProcessor
@@ -20,6 +24,9 @@ from models.deepseek_v3 import DeepSeek_V3
 from models.tiny_aya import TinyAya
 class PredictRequest(BaseModel):
     query: str = Field(..., min_length=1, description="User query text")
     model: str = Field(default="Llama-3-8B", description="Model name key")
@@ -36,6 +43,102 @@ class PredictResponse(BaseModel):
     metrics: dict[str, float]
 # Fastapi setup
 # Fastapi allows us to define python based endpoint
@@ -89,10 +192,16 @@ def _resolve_model(name: str, models: dict[str, Any]) -> tuple[str, Any]:
 @app.on_event("startup")
 def startup_event() -> None:
     load_dotenv()
     hf_token = os.getenv("HF_TOKEN")
     pinecone_api_key = os.getenv("PINECONE_API_KEY")
     if not pinecone_api_key:
         raise RuntimeError("PINECONE_API_KEY not found in environment variables")
@@ -101,35 +210,71 @@ def startup_event() -> None:
     index_name = "cbt-book-recursive"
     embed_model_name = "all-MiniLM-L6-v2"
-    startup_start = time.perf_counter()
     index = get_index_by_name(
         api_key=pinecone_api_key,
         index_name=index_name
     )
     chunks_start = time.perf_counter()
-    final_chunks = load_chunks_from_pinecone(index)
     chunk_load_time = time.perf_counter() - chunks_start
     if not final_chunks:
         raise RuntimeError("No chunks found in Pinecone metadata. Run indexing once before API mode.")
-    proc = ChunkProcessor(model_name=embed_model_name, verbose=False)
     retriever = HybridRetriever(final_chunks, proc.encoder, verbose=False)
     rag_engine = RAGGenerator()
     models = _build_models(hf_token)
     state["index"] = index
     state["retriever"] = retriever
     state["rag_engine"] = rag_engine
     state["models"] = models
     startup_time = time.perf_counter() - startup_start
     print(
         f"API startup complete | chunks={len(final_chunks)} | "
-        f"chunk_load={chunk_load_time:.3f}s | total={startup_time:.3f}s"
     )
@@ -139,27 +284,65 @@ def health() -> dict[str, str]:
     return {"status": "ok" if ready else "starting"}
 # Predict endpoint that takes a query and returns an answer along with contexts and metrics
 # is called from the frontend when user clicks submits
 # Also resolves model based on user selection
 @app.post("/predict", response_model=PredictResponse)
 def predict(payload: PredictRequest) -> PredictResponse:
     if not state:
         raise HTTPException(status_code=503, detail="Service not initialized yet")
     query = payload.query.strip()
     if not query:
         raise HTTPException(status_code=400, detail="Query cannot be empty")
-    total_start = time.perf_counter()
     retriever: HybridRetriever = state["retriever"]
     index = state["index"]
     rag_engine: RAGGenerator = state["rag_engine"]
     models: dict[str, Any] = state["models"]
     model_name, model_instance = _resolve_model(payload.model, models)
     retrieval_start = time.perf_counter()
     contexts = retriever.search(
@@ -177,19 +360,116 @@ def predict(payload: PredictRequest) -> PredictResponse:
     if not contexts:
         raise HTTPException(status_code=404, detail="No context chunks retrieved for this query")
-    generation_start = time.perf_counter()
     answer = rag_engine.get_answer(model_instance, query, contexts, temperature=0.1)
-    generation_time = time.perf_counter() - generation_start
-    total_time = time.perf_counter() - total_start
     return PredictResponse(
         model=model_name,
         answer=answer,
         contexts=contexts,
-        metrics={
-            "retrieval_s": round(retrieval_time, 3),
-            "generation_s": round(generation_time, 3),
-            "total_s": round(total_time, 3),
         },
     )

 # Fastapi endpoints defined here
+import json
 import os
+import re
 import time
 from typing import Any
 from dotenv import load_dotenv
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
+from huggingface_hub import InferenceClient
 from pydantic import BaseModel, Field
+from vector_db import get_index_by_name, load_chunks_with_local_cache
 from retriever.retriever import HybridRetriever
 from retriever.generator import RAGGenerator
 from retriever.processor import ChunkProcessor
 from models.tiny_aya import TinyAya
+# Added caching and per-stage timing logs to track how long each stage takes
 class PredictRequest(BaseModel):
     query: str = Field(..., min_length=1, description="User query text")
     model: str = Field(default="Llama-3-8B", description="Model name key")
     metrics: dict[str, float]
class TitleRequest(BaseModel):
    """Request body for the ``/predict/title`` endpoint."""
    # The user's first message in a chat; must contain at least one character.
    query: str = Field(..., min_length=1, description="First user message")
class TitleResponse(BaseModel):
    """Response body for the ``/predict/title`` endpoint."""
    # Suggested chat title.
    title: str
    # Where the title came from: "hf:<model_id>" when a hosted model produced
    # it, "rule-based" when the keyword fallback was used.
    source: str
+def _to_ndjson(payload: dict[str, Any]) -> str:
+    return json.dumps(payload, ensure_ascii=False) + "\n"
+# Simplest possible implementation to determine the chat title.
+# Used as the fallback in case HF generation fails.
+def _title_from_query(query: str) -> str:
+    stop_words = {
+        "a", "an", "and", "are", "as", "at", "be", "by", "can", "do", "for", "from", "how",
+        "i", "in", "is", "it", "me", "my", "of", "on", "or", "please", "show", "tell", "that",
+        "the", "this", "to", "we", "what", "when", "where", "which", "why", "with", "you", "your",
+    }
+    words = re.findall(r"[A-Za-z0-9][A-Za-z0-9\-_/+]*", query)
+    if not words:
+        return "New Chat"
+    filtered: list[str] = []
+    for word in words:
+        cleaned = word.strip("-_/+")
+        if not cleaned:
+            continue
+        if cleaned.lower() in stop_words:
+            continue
+        filtered.append(cleaned)
+        if len(filtered) >= 6:
+            break
+    chosen = filtered if filtered else words[:6]
+    normalized = [w.capitalize() if w.islower() else w for w in chosen]
+    title = " ".join(normalized).strip()
+    return title[:80] if title else "New Chat"
+# Title generation using an HF-hosted model: a simple prompt asks for a concise title based on the user query, and a few formatting rules ensure clean output. If generation fails or returns an empty title, the endpoint falls back to the rule-based method.
+# Called from the /predict/title endpoint.
+def _clean_title_text(raw: str) -> str:
+    text = (raw or "").strip()
+    text = text.replace("\n", " ").replace("\r", " ")
+    text = re.sub(r"^[\"'`\s]+|[\"'`\s]+$", "", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    words = text.split()
+    if len(words) > 8:
+        text = " ".join(words[:8])
+    return text[:80]
def _title_from_hf(query: str, client: InferenceClient, model_id: str) -> str | None:
    """Ask a hosted chat model for a short title describing ``query``.

    Args:
        query: The user's message.
        client: Inference client used for the chat-completion call.
        model_id: Identifier of the model to query.

    Returns:
        The cleaned title, or ``None`` when the response has no choices or the
        cleaned title is empty or equal to "new chat" (case-insensitive).
        Exceptions raised by the client call are not caught here.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You generate short chat titles. Return only a title, no punctuation at the end, no quotes."
            ),
        },
        {
            "role": "user",
            "content": (
                "Create a concise 3-7 word title for this user request:\n"
                f"{query}"
            ),
        },
    ]
    completion = client.chat_completion(
        model=model_id,
        messages=messages,
        max_tokens=24,
        temperature=0.3,
    )
    if not completion or not completion.choices:
        return None

    candidate = _clean_title_text(completion.choices[0].message.content or "")
    if candidate and candidate.lower() != "new chat":
        return candidate
    return None
+def _parse_title_model_candidates() -> list[str]:
+    raw = os.getenv(
+        "TITLE_MODEL_IDS",
+        "Qwen/Qwen2.5-1.5B-Instruct,CohereLabs/tiny-aya-global,meta-llama/Meta-Llama-3-8B-Instruct",
+    )
+    models = [m.strip() for m in raw.split(",") if m.strip()]
+    return models or ["meta-llama/Meta-Llama-3-8B-Instruct"]
 # Fastapi setup
 # Fastapi allows us to define python based endpoint
 @app.on_event("startup")
 def startup_event() -> None:
+    startup_start = time.perf_counter()
+    dotenv_start = time.perf_counter()
     load_dotenv()
+    dotenv_time = time.perf_counter() - dotenv_start
+    env_start = time.perf_counter()
     hf_token = os.getenv("HF_TOKEN")
     pinecone_api_key = os.getenv("PINECONE_API_KEY")
+    env_time = time.perf_counter() - env_start
     if not pinecone_api_key:
         raise RuntimeError("PINECONE_API_KEY not found in environment variables")
     index_name = "cbt-book-recursive"
     embed_model_name = "all-MiniLM-L6-v2"
+    project_root = os.path.dirname(os.path.abspath(__file__))
+    cache_dir = os.getenv("BM25_CACHE_DIR", os.path.join(project_root, ".cache"))
+    force_cache_refresh = os.getenv("BM25_CACHE_REFRESH", "0").lower() in {"1", "true", "yes"}
+    index_start = time.perf_counter()
     index = get_index_by_name(
         api_key=pinecone_api_key,
         index_name=index_name
     )
+    index_time = time.perf_counter() - index_start
     chunks_start = time.perf_counter()
+    final_chunks, chunk_source = load_chunks_with_local_cache(
+        index=index,
+        index_name=index_name,
+        cache_dir=cache_dir,
+        batch_size=100,
+        force_refresh=force_cache_refresh,
+    )
     chunk_load_time = time.perf_counter() - chunks_start
     if not final_chunks:
         raise RuntimeError("No chunks found in Pinecone metadata. Run indexing once before API mode.")
+    processor_start = time.perf_counter()
+    proc = ChunkProcessor(model_name=embed_model_name, verbose=False, load_hf_embeddings=False)
+    processor_time = time.perf_counter() - processor_start
+    retriever_start = time.perf_counter()
     retriever = HybridRetriever(final_chunks, proc.encoder, verbose=False)
+    retriever_time = time.perf_counter() - retriever_start
+    rag_start = time.perf_counter()
     rag_engine = RAGGenerator()
+    rag_time = time.perf_counter() - rag_start
+    models_start = time.perf_counter()
     models = _build_models(hf_token)
+    models_time = time.perf_counter() - models_start
+    state_start = time.perf_counter()
     state["index"] = index
     state["retriever"] = retriever
     state["rag_engine"] = rag_engine
     state["models"] = models
+    state["title_model_ids"] = _parse_title_model_candidates()
+    state["title_client"] = InferenceClient(token=hf_token)
+    state_time = time.perf_counter() - state_start
     startup_time = time.perf_counter() - startup_start
     print(
         f"API startup complete | chunks={len(final_chunks)} | "
+        f"dotenv={dotenv_time:.3f}s | "
+        f"env={env_time:.3f}s | "
+        f"index={index_time:.3f}s | "
+        f"cache_dir={cache_dir} | "
+        f"force_cache_refresh={force_cache_refresh} | "
+        f"chunk_source={chunk_source} | "
+        f"chunk_load={chunk_load_time:.3f}s | "
+        f"processor={processor_time:.3f}s | "
+        f"retriever={retriever_time:.3f}s | "
+        f"rag={rag_time:.3f}s | "
+        f"models={models_time:.3f}s | "
+        f"state={state_time:.3f}s | "
+        f"total={startup_time:.3f}s"
     )
     return {"status": "ok" if ready else "starting"}
+# Title generation endpoint.
+# Called only once, after the first prompt of a newly created chat.
@app.post("/predict/title", response_model=TitleResponse)
def suggest_title(payload: TitleRequest) -> TitleResponse:
    """Suggest a chat title for the first user message.

    Each configured title model is tried in order through the shared
    inference client; the first non-empty title wins. If no client is
    available, or every model fails or returns nothing, the keyword-based
    title is returned instead.

    Raises:
        HTTPException: 400 when the query is blank after stripping.
    """
    query = payload.query.strip()
    if not query:
        raise HTTPException(status_code=400, detail="Query cannot be empty")

    fallback_title = _title_from_query(query)
    title_client: InferenceClient | None = state.get("title_client")
    title_model_ids: list[str] = state.get("title_model_ids", _parse_title_model_candidates())

    # Without a client there is nothing to try; go straight to the fallback.
    candidates = title_model_ids if title_client is not None else []
    for title_model_id in candidates:
        try:
            hf_title = _title_from_hf(query, title_client, title_model_id)
            if hf_title:
                return TitleResponse(title=hf_title, source=f"hf:{title_model_id}")
        except Exception as exc:
            err_text = str(exc)
            # Provider/model availability differs across HF accounts, so an
            # unsupported model is skipped quietly; anything else is logged.
            unsupported = (
                "model_not_supported" in err_text
                or "not supported by any provider" in err_text
            )
            if not unsupported:
                print(f"Title generation model failed ({title_model_id}): {exc}")

    print("Title generation fallback triggered: no title model available/successful")
    return TitleResponse(title=fallback_title, source="rule-based")
 # Predict endpoint that takes a query and returns an answer along with contexts and metrics
 # is called from the frontend when user clicks submits
 # Also resolves model based on user selection
 @app.post("/predict", response_model=PredictResponse)
 def predict(payload: PredictRequest) -> PredictResponse:
+    req_start = time.perf_counter()
+    precheck_start = time.perf_counter()
     if not state:
         raise HTTPException(status_code=503, detail="Service not initialized yet")
     query = payload.query.strip()
     if not query:
         raise HTTPException(status_code=400, detail="Query cannot be empty")
+    precheck_time = time.perf_counter() - precheck_start
+    state_access_start = time.perf_counter()
     retriever: HybridRetriever = state["retriever"]
     index = state["index"]
     rag_engine: RAGGenerator = state["rag_engine"]
     models: dict[str, Any] = state["models"]
+    state_access_time = time.perf_counter() - state_access_start
+    model_resolve_start = time.perf_counter()
     model_name, model_instance = _resolve_model(payload.model, models)
+    model_resolve_time = time.perf_counter() - model_resolve_start
     retrieval_start = time.perf_counter()
     contexts = retriever.search(
     if not contexts:
         raise HTTPException(status_code=404, detail="No context chunks retrieved for this query")
+    inference_start = time.perf_counter()
     answer = rag_engine.get_answer(model_instance, query, contexts, temperature=0.1)
+    inference_time = time.perf_counter() - inference_start
+    response_start = time.perf_counter()
+    metrics = {
+        "precheck_s": round(precheck_time, 3),
+        "state_access_s": round(state_access_time, 3),
+        "model_resolve_s": round(model_resolve_time, 3),
+        "retrieval_s": round(retrieval_time, 3),
+        "inference_s": round(inference_time, 3),
+    }
+    response_build_time = time.perf_counter() - response_start
+    total_time = time.perf_counter() - req_start
+    metrics["response_build_s"] = round(response_build_time, 3)
+    metrics["total_s"] = round(total_time, 3)
+    print(
+        f"Predict timing | model={model_name} | mode={payload.mode} | "
+        f"rerank={payload.rerank_strategy} | precheck={precheck_time:.3f}s | "
+        f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
+        f"retrieval={retrieval_time:.3f}s | inference={inference_time:.3f}s | "
+        f"response_build={response_build_time:.3f}s | total={total_time:.3f}s"
+    )
     return PredictResponse(
         model=model_name,
         answer=answer,
         contexts=contexts,
+        metrics=metrics,
+    )
+# New endpoint for streaming responses: lets the frontend render tokens as they arrive instead of waiting for the full answer.
@app.post("/predict/stream")
def predict_stream(payload: PredictRequest) -> StreamingResponse:
    """Stream a RAG answer as newline-delimited JSON (NDJSON).

    Validation, model resolution and retrieval run before the response
    starts, so failures there surface as regular HTTP errors. Generation runs
    inside the response body, which emits one JSON object per line:

    * ``{"type": "token", "token": ...}`` for every chunk the model yields,
    * ``{"type": "done", "model", "answer", "metrics"}`` when generation ends,
    * ``{"type": "error", "message": ...}`` if generation raises.

    Raises:
        HTTPException: 503 when ``state`` is empty, 400 for a blank query,
            404 when retrieval returns no contexts.
    """
    req_start = time.perf_counter()

    precheck_start = time.perf_counter()
    if not state:
        raise HTTPException(status_code=503, detail="Service not initialized yet")
    query = payload.query.strip()
    if not query:
        raise HTTPException(status_code=400, detail="Query cannot be empty")
    precheck_time = time.perf_counter() - precheck_start

    state_access_start = time.perf_counter()
    retriever: HybridRetriever = state["retriever"]
    index = state["index"]
    rag_engine: RAGGenerator = state["rag_engine"]
    models: dict[str, Any] = state["models"]
    state_access_time = time.perf_counter() - state_access_start

    model_resolve_start = time.perf_counter()
    model_name, model_instance = _resolve_model(payload.model, models)
    model_resolve_time = time.perf_counter() - model_resolve_start

    retrieval_start = time.perf_counter()
    contexts = retriever.search(
        query,
        index,
        mode=payload.mode,
        rerank_strategy=payload.rerank_strategy,
        use_mmr=True,
        top_k=payload.top_k,
        final_k=payload.final_k,
        verbose=False,
    )
    retrieval_time = time.perf_counter() - retrieval_start
    if not contexts:
        raise HTTPException(status_code=404, detail="No context chunks retrieved for this query")

    def stream_events():
        # Runs while the response body is being sent, so errors can no longer
        # become HTTP status codes; they are reported as an "error" event.
        inference_start = time.perf_counter()
        answer_parts: list[str] = []
        try:
            for token in rag_engine.get_answer_stream(model_instance, query, contexts, temperature=0.1):
                answer_parts.append(token)
                yield _to_ndjson({"type": "token", "token": token})

            inference_time = time.perf_counter() - inference_start
            total_time = time.perf_counter() - req_start
            answer = "".join(answer_parts)
            metrics = {
                "precheck_s": round(precheck_time, 3),
                "state_access_s": round(state_access_time, 3),
                "model_resolve_s": round(model_resolve_time, 3),
                "retrieval_s": round(retrieval_time, 3),
                "inference_s": round(inference_time, 3),
                "total_s": round(total_time, 3),
            }
            # Log before the final event so the timing line is written even
            # if the client disconnects while receiving it.
            print(
                f"Predict stream timing | model={model_name} | mode={payload.mode} | "
                f"rerank={payload.rerank_strategy} | precheck={precheck_time:.3f}s | "
                f"state_access={state_access_time:.3f}s | model_resolve={model_resolve_time:.3f}s | "
                f"retrieval={retrieval_time:.3f}s | inference={inference_time:.3f}s | "
                f"total={total_time:.3f}s"
            )
            yield _to_ndjson(
                {
                    "type": "done",
                    "model": model_name,
                    "answer": answer,
                    "metrics": metrics,
                }
            )
        except Exception as exc:
            # Keep a server-side trace; otherwise the failure is only visible
            # to the client that happened to receive the error event.
            print(f"Predict stream failed | model={model_name} | error={exc}")
            yield _to_ndjson({"type": "error", "message": f"Streaming failed: {exc}"})

    return StreamingResponse(
        stream_events(),
        media_type="application/x-ndjson",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )

models/deepseek_v3.py CHANGED Viewed

@@ -17,7 +17,10 @@ class DeepSeek_V3:
             ):
                 if message.choices:
                     content = message.choices[0].delta.content
-                    if content: response += content
         except Exception as e:
-            return f" DeepSeek API Busy: {e}"
-        return response

             ):
                 if message.choices:
                     content = message.choices[0].delta.content
+                    if content:
+                        yield content
         except Exception as e:
+            yield f" DeepSeek API Busy: {e}"
+    def generate(self, prompt, max_tokens=500, temperature=0.1):
+        return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))

models/llama_3_8b.py CHANGED Viewed

@@ -16,5 +16,8 @@ class Llama3_8B:
         ):
             if message.choices:
                 content = message.choices[0].delta.content
-                if content: response += content
-        return response

         ):
             if message.choices:
                 content = message.choices[0].delta.content
+                if content:
+                    yield content
+    def generate(self, prompt, max_tokens=500, temperature=0.1):
+        return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))

models/mistral_7b.py CHANGED Viewed

@@ -18,9 +18,10 @@ class Mistral_7b:
             for chunk in stream:
                 if chunk.choices and chunk.choices[0].delta.content:
                     content = chunk.choices[0].delta.content
-                    response += content
         except Exception as e:
-            return f" Mistral Featherless Error: {e}"
-        return response

             for chunk in stream:
                 if chunk.choices and chunk.choices[0].delta.content:
                     content = chunk.choices[0].delta.content
+                    yield content
         except Exception as e:
+            yield f" Mistral Featherless Error: {e}"
+    def generate(self, prompt, max_tokens=500, temperature=0.1):
+        return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))

models/qwen_2_5.py CHANGED Viewed

@@ -16,5 +16,8 @@ class Qwen2_5:
         ):
             if message.choices:
                 content = message.choices[0].delta.content
-                if content: response += content
-        return response

         ):
             if message.choices:
                 content = message.choices[0].delta.content
+                if content:
+                    yield content
+    def generate(self, prompt, max_tokens=500, temperature=0.1):
+        return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))

models/tiny_aya.py CHANGED Viewed

@@ -18,8 +18,10 @@ class TinyAya:
             ):
                 if message.choices:
                     content = message.choices[0].delta.content
-                    if content: response += content
         except Exception as e:
-            return f" TinyAya Error: {e}"
-        return response

             ):
                 if message.choices:
                     content = message.choices[0].delta.content
+                    if content:
+                        yield content
         except Exception as e:
+            yield f" TinyAya Error: {e}"
+    def generate(self, prompt, max_tokens=500, temperature=0.1):
+        return "".join(self.generate_stream(prompt, max_tokens=max_tokens, temperature=temperature))

requirements.txt CHANGED Viewed

@@ -16,6 +16,7 @@ fastapi==0.121.1
 filelock==3.25.2
 frozenlist==1.8.0
 fsspec==2026.2.0
 greenlet==3.3.2
 h11==0.16.0
 hf-xet==1.4.2

 filelock==3.25.2
 frozenlist==1.8.0
 fsspec==2026.2.0
+groq
 greenlet==3.3.2
 h11==0.16.0
 hf-xet==1.4.2

retriever/generator.py CHANGED Viewed

@@ -1,8 +1,10 @@
 class RAGGenerator:
     def generate_prompt(self, query, retrieved_contexts):
         """Prepares the academic prompt template."""
         context_text = "\n\n".join([f"--- Source {i+1} ---\n{c}" for i, c in enumerate(retrieved_contexts)])
         return f"""You are an expert academic assistant. Use the following pieces of retrieved context to answer the question.
 If the answer isn't in the context, say you don't know based on the provided documents.
@@ -16,4 +18,9 @@ Answer:"""
     def get_answer(self, model_instance, query, retrieved_contexts, **kwargs):
         """Uses a specific model instance to generate the final answer."""
         prompt = self.generate_prompt(query, retrieved_contexts)
-        return model_instance.generate(prompt, **kwargs)

+# Changed the prompt to output Markdown, plus some formatting details.
+# Also added get_answer_stream for incremental token rendering on the frontend.
+# --@Qamar
 class RAGGenerator:
     def generate_prompt(self, query, retrieved_contexts):
         """Prepares the academic prompt template."""
         context_text = "\n\n".join([f"--- Source {i+1} ---\n{c}" for i, c in enumerate(retrieved_contexts)])
         return f"""You are an expert academic assistant. Use the following pieces of retrieved context to answer the question.
 If the answer isn't in the context, say you don't know based on the provided documents.
     def get_answer(self, model_instance, query, retrieved_contexts, **kwargs):
         """Uses a specific model instance to generate the final answer."""
         prompt = self.generate_prompt(query, retrieved_contexts)
+        return model_instance.generate(prompt, **kwargs)
+    def get_answer_stream(self, model_instance, query, retrieved_contexts, **kwargs):
+        """Yields model output chunks so the frontend can render incremental tokens."""
+        prompt = self.generate_prompt(query, retrieved_contexts)
+        return model_instance.generate_stream(prompt, **kwargs)

retriever/processor.py CHANGED Viewed

@@ -14,11 +14,16 @@ import pandas as pd
 class ChunkProcessor:
-    def __init__(self, model_name='all-MiniLM-L6-v2', verbose: bool = True):
         self.model_name = model_name
         self.encoder = SentenceTransformer(model_name)
         self.verbose = verbose
-        self.hf_embeddings = HuggingFaceEmbeddings(model_name=model_name)
     # ------------------------------------------------------------------
     # Splitters
@@ -84,7 +89,7 @@ class ChunkProcessor:
         elif technique == "semantic":
             return SemanticChunker(
-                self.hf_embeddings,
                 breakpoint_threshold_type=kwargs.get('breakpoint_threshold_type', "percentile"),
                 # Using 70 because 95 was giving way too big chunks
                 breakpoint_threshold_amount=kwargs.get('breakpoint_threshold_amount', 70)

 class ChunkProcessor:
+    def __init__(self, model_name='all-MiniLM-L6-v2', verbose: bool = True, load_hf_embeddings: bool = False):
         self.model_name = model_name
         self.encoder = SentenceTransformer(model_name)
         self.verbose = verbose
+        self.hf_embeddings = HuggingFaceEmbeddings(model_name=model_name) if load_hf_embeddings else None
+    def _get_hf_embeddings(self):
+        if self.hf_embeddings is None:
+            self.hf_embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
+        return self.hf_embeddings
     # ------------------------------------------------------------------
     # Splitters
         elif technique == "semantic":
             return SemanticChunker(
+                self._get_hf_embeddings(),
                 breakpoint_threshold_type=kwargs.get('breakpoint_threshold_type', "percentile"),
                 # Using 70 because 95 was giving way too big chunks
                 breakpoint_threshold_amount=kwargs.get('breakpoint_threshold_amount', 70)

retriever/retriever.py CHANGED Viewed

@@ -30,6 +30,17 @@ class HybridRetriever:
         # Better tokenization for BM25 (strips punctuation)
         self.tokenized_corpus = [self._tokenize(chunk['metadata']['text']) for chunk in final_chunks]
         self.bm25 = BM25Okapi(self.tokenized_corpus)
     def _tokenize(self, text: str) -> List[str]:
         """Tokenize text using regex to strip punctuation."""

         # Better tokenization for BM25 (strips punctuation)
         self.tokenized_corpus = [self._tokenize(chunk['metadata']['text']) for chunk in final_chunks]
         self.bm25 = BM25Okapi(self.tokenized_corpus)
+        bm25_time = time.perf_counter() - bm25_start
+        total_time = time.perf_counter() - init_start
+        print(
+            "HybridRetriever init complete | "
+            f"chunks={len(final_chunks)} | "
+            f"reranker_load={reranker_time:.3f}s | "
+            f"tokenize={tokenization_time:.3f}s | "
+            f"bm25_build={bm25_time:.3f}s | "
+            f"total={total_time:.3f}s"
+        )
     def _tokenize(self, text: str) -> List[str]:
         """Tokenize text using regex to strip punctuation."""

vector_db.py CHANGED Viewed

@@ -1,7 +1,14 @@
 import time
 import re
 from pinecone import Pinecone, ServerlessSpec
 def slugify_technique(name):
     """Converts 'Sentence Splitter' to 'sentence-splitter' for Pinecone naming."""
     return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
@@ -109,6 +116,80 @@ def upsert_to_pinecone(index, chunks, batch_size=100):
         batch = chunks[i : i + batch_size]
         index.upsert(vectors=batch)
 def load_chunks_from_pinecone(index, batch_size: int = 100) -> list[dict[str, any]]:
     """

 import time
 import re
+import json
+from pathlib import Path
+from typing import Any, Dict, List
 from pinecone import Pinecone, ServerlessSpec
+# Added caching to reduce startup time on consecutive runs
+# --@Qamar
 def slugify_technique(name):
     """Converts 'Sentence Splitter' to 'sentence-splitter' for Pinecone naming."""
     return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
         batch = chunks[i : i + batch_size]
         index.upsert(vectors=batch)
+# Helpers for loading chunks back from Pinecone with local caching to speed up BM25 initialization
+def _sanitize_index_name(index_name: str) -> str:
+    return re.sub(r'[^a-zA-Z0-9._-]+', '-', index_name).strip('-') or 'default-index'
def _chunk_cache_path(cache_dir: str, index_name: str) -> Path:
    """Return the JSON cache file path for ``index_name`` inside ``cache_dir``.

    The cache directory (and any missing parents) is created as a side effect.
    """
    directory = Path(cache_dir)
    directory.mkdir(parents=True, exist_ok=True)
    file_name = f"bm25_chunks_{_sanitize_index_name(index_name)}.json"
    return directory / file_name
+def _read_chunk_cache(path: Path) -> Dict[str, Any]:
+    with path.open("r", encoding="utf-8") as f:
+        return json.load(f)
+def _write_chunk_cache(path: Path, payload: Dict[str, Any]) -> None:
+    with path.open("w", encoding="utf-8") as f:
+        json.dump(payload, f)
def load_chunks_with_local_cache(
    index,
    index_name: str,
    cache_dir: str = ".cache",
    batch_size: int = 100,
    force_refresh: bool = False,
) -> tuple[List[Dict[str, Any]], str]:
    """Load chunks from a local JSON cache when it is fresh, else from Pinecone.

    The cache is considered fresh when its stored ``vector_count`` equals the
    index's current ``total_vector_count`` and it holds at least one chunk.
    The cache is strictly best-effort: problems creating, reading or writing
    it are printed and the chunks are loaded from Pinecone instead.

    Args:
        index: Pinecone index handle; ``describe_index_stats()`` is called on it.
        index_name: Index name, used to derive the cache file name.
        cache_dir: Directory that holds the cache file.
        batch_size: Forwarded to ``load_chunks_from_pinecone``.
        force_refresh: Skip the cache read and reload from Pinecone.

    Returns:
        ``(chunks, source)`` where ``source`` is ``"cache"`` or ``"pinecone"``.
    """
    # Creating the cache directory can fail (e.g. read-only filesystem); that
    # must not abort loading, so continue without a cache in that case.
    try:
        cache_file = _chunk_cache_path(cache_dir=cache_dir, index_name=index_name)
    except OSError as e:
        print(f" BM25 cache unavailable ({cache_dir}): {e}. Loading from Pinecone without caching...")
        cache_file = None

    stats = index.describe_index_stats()
    current_count = stats.get("total_vector_count", 0)

    if cache_file is not None and not force_refresh and cache_file.exists():
        try:
            cached_payload = _read_chunk_cache(cache_file)
            cached_meta = cached_payload.get("meta", {})
            cached_count = cached_meta.get("vector_count", -1)
            cached_chunks = cached_payload.get("chunks", [])

            if cached_count == current_count and cached_chunks:
                print(
                    f" Loaded BM25 chunk cache: {cache_file} "
                    f"(chunks={len(cached_chunks)}, vectors={cached_count})"
                )
                return cached_chunks, "cache"

            print(
                " BM25 cache stale or empty. "
                f"cache_vectors={cached_count}, pinecone_vectors={current_count}. Refreshing..."
            )
        except Exception as e:
            # Any unreadable or malformed cache is treated as a cache miss.
            print(f" Failed to read BM25 cache ({cache_file}): {e}. Refreshing from Pinecone...")

    chunks = load_chunks_from_pinecone(index=index, batch_size=batch_size)
    if cache_file is None:
        return chunks, "pinecone"

    payload = {
        "meta": {
            "index_name": index_name,
            "vector_count": current_count,
            "updated_at_epoch_s": int(time.time()),
        },
        "chunks": chunks,
    }

    try:
        _write_chunk_cache(cache_file, payload)
        print(f" Saved BM25 chunk cache: {cache_file} (chunks={len(chunks)})")
    except Exception as e:
        # A failed save only costs a slower next startup.
        print(f" Failed to write BM25 cache ({cache_file}): {e}")

    return chunks, "pinecone"
 def load_chunks_from_pinecone(index, batch_size: int = 100) -> list[dict[str, any]]:
     """