added time metrics
Browse files- .gitignore +1 -1
- api.py +17 -0
- main.py +1 -0
- query_only.py +1 -2
.gitignore
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
__pycache__/
|
| 3 |
*.py[cod]
|
| 4 |
*$py.class
|
|
|
|
| 1 |
+
# python specific ignores
|
| 2 |
__pycache__/
|
| 3 |
*.py[cod]
|
| 4 |
*$py.class
|
api.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import time
|
| 3 |
from typing import Any
|
|
@@ -38,6 +39,11 @@ class PredictResponse(BaseModel):
|
|
| 38 |
metrics: dict[str, float]
|
| 39 |
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
app = FastAPI(title="RAG-AS3 API", version="0.1.0")
|
| 42 |
|
| 43 |
app.add_middleware(
|
|
@@ -77,6 +83,13 @@ def _resolve_model(name: str, models: dict[str, Any]) -> tuple[str, Any]:
|
|
| 77 |
return model_key, models[model_key]
|
| 78 |
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
@app.on_event("startup")
|
| 81 |
def startup_event() -> None:
|
| 82 |
load_dotenv()
|
|
@@ -131,6 +144,10 @@ def health() -> dict[str, str]:
|
|
| 131 |
return {"status": "ok" if ready else "starting"}
|
| 132 |
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
@app.post("/predict", response_model=PredictResponse)
|
| 135 |
def predict(payload: PredictRequest) -> PredictResponse:
|
| 136 |
if not state:
|
|
|
|
| 1 |
+
# FastAPI endpoints defined here
|
| 2 |
import os
|
| 3 |
import time
|
| 4 |
from typing import Any
|
|
|
|
| 39 |
metrics: dict[str, float]
|
| 40 |
|
| 41 |
|
| 42 |
+
|
| 43 |
+
# FastAPI setup
|
| 44 |
+
# FastAPI allows us to define Python-based endpoints
|
| 45 |
+
# that are called from the React-based frontend
|
| 46 |
+
|
| 47 |
app = FastAPI(title="RAG-AS3 API", version="0.1.0")
|
| 48 |
|
| 49 |
app.add_middleware(
|
|
|
|
| 83 |
return model_key, models[model_key]
|
| 84 |
|
| 85 |
|
| 86 |
+
# On startup, most of the time is spent loading chunks from Pinecone
|
| 87 |
+
# This is done because BM25 needs the entire corpus in memory
|
| 88 |
+
# We want to avoid loading it on every query, so loading it at startup is better
|
| 89 |
+
|
| 90 |
+
# Could improve this, as it is not ideal to load the entire corpus into memory
|
| 91 |
+
# Currently it won't scale well
|
| 92 |
+
|
| 93 |
@app.on_event("startup")
|
| 94 |
def startup_event() -> None:
|
| 95 |
load_dotenv()
|
|
|
|
| 144 |
return {"status": "ok" if ready else "starting"}
|
| 145 |
|
| 146 |
|
| 147 |
+
|
| 148 |
+
# Predict endpoint that takes a query and returns an answer along with contexts and metrics
|
| 149 |
+
# Is called from the frontend when the user clicks submit
|
| 150 |
+
# Also resolves the model based on the user's selection
|
| 151 |
@app.post("/predict", response_model=PredictResponse)
|
| 152 |
def predict(payload: PredictRequest) -> PredictResponse:
|
| 153 |
if not state:
|
main.py
CHANGED
|
@@ -19,6 +19,7 @@ def main():
|
|
| 19 |
|
| 20 |
# ------------------------------------------------------------------
|
| 21 |
# 0. Configuration
|
|
|
|
| 22 |
# ------------------------------------------------------------------
|
| 23 |
hf_token = os.getenv("HF_TOKEN")
|
| 24 |
pinecone_api_key = os.getenv("PINECONE_API_KEY")
|
|
|
|
| 19 |
|
| 20 |
# ------------------------------------------------------------------
|
| 21 |
# 0. Configuration
|
| 22 |
+
# Query defined here
|
| 23 |
# ------------------------------------------------------------------
|
| 24 |
hf_token = os.getenv("HF_TOKEN")
|
| 25 |
pinecone_api_key = os.getenv("PINECONE_API_KEY")
|
query_only.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
# This file is for inference without actually embedding documents
|
| 2 |
# Main does embedding every time, which is redundant for querying.
|
| 3 |
-
|
| 4 |
-
|
| 5 |
|
| 6 |
import os
|
| 7 |
import time
|
|
|
|
| 1 |
# This file is for inference without actually embedding documents
|
| 2 |
# Main does embedding every time, which is redundant for querying.
|
| 3 |
+
# Made this just to test the querying part --@Qamar
|
|
|
|
| 4 |
|
| 5 |
import os
|
| 6 |
import time
|