khronoz committed
Commit afc8144 · 1 Parent(s): 8500091

Add CORS for prod & check if CUDA is available before loading model

backend/backend/app/utils/index.py CHANGED
@@ -34,6 +34,10 @@ DATA_DIR = str(
     current_directory / "data"
 ) # directory containing the documents to index
 
+
+# set to at least 1 to use GPU, adjust according to your GPU memory, but must be able to fit the model
+model_kwargs = {"n_gpu_layers": 100} if DEVICE_TYPE == "cuda" else {}
+
 llm = LlamaCPP(
     model_url="https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf",
     temperature=0.1,
@@ -43,8 +47,7 @@ llm = LlamaCPP(
     # kwargs to pass to __call__()
     # generate_kwargs={},
     # kwargs to pass to __init__()
-    # set to at least 1 to use GPU, adjust according to your GPU memory, but must be able to fit the model
-    model_kwargs={"n_gpu_layers": 100},
+    model_kwargs=model_kwargs,
     # transform inputs into Llama2 format
     messages_to_prompt=messages_to_prompt,
     completion_to_prompt=completion_to_prompt,
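
Note: the new model_kwargs line refers to a DEVICE_TYPE constant whose definition lies outside this hunk. A minimal sketch of how such a constant could be derived, assuming it reuses the same torch.cuda.is_available check this commit imports in main.py (the derivation shown here is an assumption, not part of the diff):

    # Hypothetical definition of DEVICE_TYPE elsewhere in index.py;
    # this commit only shows the constant being used, not defined.
    from torch.cuda import is_available as is_cuda_available

    DEVICE_TYPE = "cuda" if is_cuda_available() else "cpu"

With that in place, n_gpu_layers is only passed to LlamaCPP when a CUDA device is actually present, so CPU-only machines no longer fail trying to offload layers to a missing GPU.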
backend/backend/main.py CHANGED
@@ -9,6 +9,7 @@ from app.utils.index import create_index
 from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
+from torch.cuda import is_available as is_cuda_available
 
 load_dotenv()
 
@@ -16,6 +17,7 @@ app = FastAPI()
 
 environment = os.getenv("ENVIRONMENT", "dev") # Default to 'development' if not set
 
+# TODO: Add reading allowed origins from environment variables
 
 if environment == "dev":
     logger = logging.getLogger("uvicorn")
@@ -28,10 +30,30 @@ if environment == "dev":
         allow_headers=["*"],
     )
 
+if environment == "prod":
+    # In production, specify the allowed origins
+    allowed_origins = [
+        "https://your-production-domain.com",
+        "https://another-production-domain.com",
+        # Add more allowed origins as needed
+    ]
+
+    logger = logging.getLogger("uvicorn")
+    logger.info(f"Running in production mode - allowing CORS for {allowed_origins}")
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=allowed_origins,
+        allow_credentials=True,
+        allow_methods=["GET", "POST", "PUT", "DELETE"],
+        allow_headers=["*"],
+    )
+
+logger.info(f"CUDA available: {is_cuda_available()}")
+
 app.include_router(chat_router, prefix="/api/chat")
 app.include_router(query_router, prefix="/api/query")
 app.include_router(search_router, prefix="/api/search")
 app.include_router(healthcheck_router, prefix="/api/healthcheck")
 
-# try to create the index first on startup
+# Try to create the index first on startup
 create_index()
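
A possible shape for the TODO left in main.py — reading the allowed origins from an environment variable instead of hardcoding them. The variable name ALLOWED_ORIGINS and the comma-separated format are assumptions for illustration, not part of this commit:

    # Sketch only: parse a hypothetical ALLOWED_ORIGINS env var, e.g.
    # ALLOWED_ORIGINS="https://your-production-domain.com,https://another-production-domain.com"
    import os

    allowed_origins = [
        origin.strip()
        for origin in os.getenv("ALLOWED_ORIGINS", "").split(",")
        if origin.strip()  # drop empty entries when the var is unset
    ]

This would let the prod origin list be configured per deployment (alongside the existing ENVIRONMENT variable) without editing code.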