added time metrics
Browse files- .gitignore +1 -1
- api.py +17 -0
- main.py +1 -0
- query_only.py +1 -2
.gitignore
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
__pycache__/
|
| 3 |
*.py[cod]
|
| 4 |
*$py.class
|
|
|
|
| 1 |
+
# python specific ignores
|
| 2 |
__pycache__/
|
| 3 |
*.py[cod]
|
| 4 |
*$py.class
|
api.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import time
|
| 3 |
from typing import Any
|
|
@@ -38,6 +39,11 @@ class PredictResponse(BaseModel):
|
|
| 38 |
metrics: dict[str, float]
|
| 39 |
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
app = FastAPI(title="RAG-AS3 API", version="0.1.0")
|
| 42 |
|
| 43 |
app.add_middleware(
|
|
@@ -77,6 +83,13 @@ def _resolve_model(name: str, models: dict[str, Any]) -> tuple[str, Any]:
|
|
| 77 |
return model_key, models[model_key]
|
| 78 |
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
@app.on_event("startup")
|
| 81 |
def startup_event() -> None:
|
| 82 |
load_dotenv()
|
|
@@ -131,6 +144,10 @@ def health() -> dict[str, str]:
|
|
| 131 |
return {"status": "ok" if ready else "starting"}
|
| 132 |
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
@app.post("/predict", response_model=PredictResponse)
|
| 135 |
def predict(payload: PredictRequest) -> PredictResponse:
|
| 136 |
if not state:
|
|
|
|
| 1 |
+
# FastAPI endpoints defined here
|
| 2 |
import os
|
| 3 |
import time
|
| 4 |
from typing import Any
|
|
|
|
| 39 |
metrics: dict[str, float]
|
| 40 |
|
| 41 |
|
| 42 |
+
|
| 43 |
+
# FastAPI setup
|
| 44 |
+
# FastAPI allows us to define Python-based endpoints
|
| 45 |
+
# that are called from the React-based frontend
|
| 46 |
+
|
| 47 |
app = FastAPI(title="RAG-AS3 API", version="0.1.0")
|
| 48 |
|
| 49 |
app.add_middleware(
|
|
|
|
| 83 |
return model_key, models[model_key]
|
| 84 |
|
| 85 |
|
| 86 |
+
# On startup, most of the time is spent loading chunks from Pinecone
|
| 87 |
+
# This is done because BM25 needs the entire corpus in memory
|
| 88 |
+
# We want to avoid loading it on every query, so loading it at startup is better
|
| 89 |
+
|
| 90 |
+
# Could improve this, as it is not ideal to load the entire corpus into memory
|
| 91 |
+
# Currently it won't scale well
|
| 92 |
+
|
| 93 |
@app.on_event("startup")
|
| 94 |
def startup_event() -> None:
|
| 95 |
load_dotenv()
|
|
|
|
| 144 |
return {"status": "ok" if ready else "starting"}
|
| 145 |
|
| 146 |
|
| 147 |
+
|
| 148 |
+
# Predict endpoint that takes a query and returns an answer along with contexts and metrics
|
| 149 |
+
# Is called from the frontend when the user clicks submit
|
| 150 |
+
# Also resolves the model based on the user's selection
|
| 151 |
@app.post("/predict", response_model=PredictResponse)
|
| 152 |
def predict(payload: PredictRequest) -> PredictResponse:
|
| 153 |
if not state:
|
main.py
CHANGED
|
@@ -19,6 +19,7 @@ def main():
|
|
| 19 |
|
| 20 |
# ------------------------------------------------------------------
|
| 21 |
# 0. Configuration
|
|
|
|
| 22 |
# ------------------------------------------------------------------
|
| 23 |
hf_token = os.getenv("HF_TOKEN")
|
| 24 |
pinecone_api_key = os.getenv("PINECONE_API_KEY")
|
|
|
|
| 19 |
|
| 20 |
# ------------------------------------------------------------------
|
| 21 |
# 0. Configuration
|
| 22 |
+
# Query defined here
|
| 23 |
# ------------------------------------------------------------------
|
| 24 |
hf_token = os.getenv("HF_TOKEN")
|
| 25 |
pinecone_api_key = os.getenv("PINECONE_API_KEY")
|
query_only.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
# This file is for inference without actually embedding documents
|
| 2 |
# Main does embedding every time, which is redundant for querying.
|
| 3 |
-
|
| 4 |
-
|
| 5 |
|
| 6 |
import os
|
| 7 |
import time
|
|
|
|
| 1 |
# This file is for inference without actually embedding documents
|
| 2 |
# Main does embedding every time, which is redundant for querying.
|
| 3 |
+
# Made this just to test the querying part --@Qamar
|
|
|
|
| 4 |
|
| 5 |
import os
|
| 6 |
import time
|