unknown committed
Commit 8ecbd6b · 1 Parent(s): e06bc75

Fixed the model optimization speed: replaced llama-cpp-python with a transformers text-generation pipeline
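
For context, a minimal standalone sketch of the new generation path follows. It is illustrative only, not part of the commit: the model id and sampling values are assumptions, it relies on the chat-style pipeline output available in recent transformers releases, and it assumes a transformers-compatible (non-GGUF) checkpoint rather than the GGUF repo that REPO_ID defaults to in app.py.

# Minimal sketch of the new generation path (illustrative, not part of the commit).
from transformers import pipeline

MODEL_ID = "google/gemma-3-270m-it"  # assumption: a non-GGUF checkpoint that pipeline() can load

generator = pipeline("text-generation", model=MODEL_ID)

messages = [{"role": "user", "content": "Hello"}]
out = generator(messages, max_new_tokens=32, do_sample=True, temperature=0.7, top_p=0.9)

# With chat-style input, recent transformers versions return the whole conversation
# under "generated_text"; the last entry is the assistant reply, which is what the
# updated generate_sync() extracts.
reply = out[0]["generated_text"][-1]["content"]
print(reply)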

Files changed (2):
  1. app.py +22 -25
  2. requirements.txt +2 -1
app.py CHANGED
@@ -6,14 +6,13 @@ from typing import List, Optional, Dict, Any
 from fastapi import FastAPI, HTTPException, Request, status
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
-from llama_cpp import Llama
+from transformers import pipeline
 from concurrent.futures import ThreadPoolExecutor
 
 # -------------------------
 # Configuration (via env)
 # -------------------------
 REPO_ID = os.getenv("REPO_ID", "unsloth/gemma-3-270m-it-GGUF")
-MODEL_FILENAME = os.getenv("MODEL_FILENAME", "gemma-3-270m-it-F16.gguf")
 MAX_WORKERS = int(os.getenv("MAX_WORKERS", "2")) # ThreadPool workers (reduced for speed)
 MAX_CONCURRENT_REQUESTS = int(os.getenv("MAX_CONCURRENT_REQUESTS", "1")) # Reduced for speed
 RATE_LIMIT_PER_MIN = int(os.getenv("RATE_LIMIT_PER_MIN", "60"))
@@ -70,7 +69,7 @@ class GenerationResponse(BaseModel):
 # -------------------------
 # Global objects
 # -------------------------
-LLM_MODEL: Optional[Llama] = None
+LLM_MODEL: Optional[Any] = None
 executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
 model_semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
 
@@ -108,20 +107,21 @@ rate_limiter = RateLimiter(RATE_LIMIT_PER_MIN)
 # build_prompt_from_messages function removed - using chat completion format directly
 
 def generate_sync(messages: List[Dict[str, str]], max_new_tokens: int, temperature: float, top_p: float, do_sample: bool, num_beams: int = 1, early_stopping: bool = True, use_cache: bool = True) -> str:
-    # llama-cpp-python generation parameters
+    # transformers pipeline generation parameters
     generation_kwargs = {
-        "max_tokens": max_new_tokens,
+        "max_new_tokens": max_new_tokens,
         "temperature": temperature,
         "top_p": top_p,
+        "do_sample": do_sample,
+        "num_beams": num_beams,
+        "early_stopping": early_stopping,
+        "use_cache": use_cache,
     }
 
-    # Create chat completion using llama-cpp-python
-    response = LLM_MODEL.create_chat_completion(
-        messages=messages,
-        **generation_kwargs
-    )
+    # Generate using transformers pipeline
+    response = LLM_MODEL(messages, **generation_kwargs)
 
-    return response["choices"][0]["message"]["content"]
+    return response[0]["generated_text"][-1]["content"] if isinstance(response[0]["generated_text"], list) else response[0]["generated_text"]
 
 async def generate_async(messages: List[Dict[str, str]], max_new_tokens: int, temperature: float, top_p: float, do_sample: bool, num_beams: int = 1, early_stopping: bool = True, use_cache: bool = True) -> str:
     loop = asyncio.get_event_loop()
@@ -138,28 +138,25 @@ async def on_startup():
     global LLM_MODEL
 
     try:
-        logger.info(f"Loading model from {REPO_ID}/{MODEL_FILENAME}...")
-        LLM_MODEL = Llama.from_pretrained(
-            repo_id=REPO_ID,
-            filename=MODEL_FILENAME,
-            n_ctx=N_CTX,
-            n_threads=N_THREADS,
-            n_gpu_layers=N_GPU_LAYERS,
-            verbose=False
+        logger.info(f"Loading model from {REPO_ID}...")
+        LLM_MODEL = pipeline(
+            "text-generation",
+            model=REPO_ID,
+            device_map="auto" if N_GPU_LAYERS > 0 else "cpu"
         )
         logger.info("Model loaded successfully.")
 
         # Warm up the model with a dummy request for faster first inference
        logger.info("Warming up model...")
         dummy_messages = [{"role": "user", "content": "Hello"}]
-        _ = LLM_MODEL.create_chat_completion(
-            messages=dummy_messages,
-            max_tokens=5,
+        _ = LLM_MODEL(
+            dummy_messages,
+            max_new_tokens=5,
             temperature=0.1
         )
         logger.info("Model warmed up successfully.")
     except Exception as e:
-        logger.error(f"Failed to load model {REPO_ID}/{MODEL_FILENAME}: {e}")
+        logger.error(f"Failed to load model {REPO_ID}: {e}")
         raise RuntimeError(f"Model loading failed: {e}") from e
 
 # -------------------------
@@ -167,7 +164,7 @@ async def on_startup():
 # -------------------------
 @app.get("/")
 async def root():
-    return {"status": "Gemma 3 API is running 🎉", "model": f"{REPO_ID}/{MODEL_FILENAME}"}
+    return {"status": "Gemma 3 API is running 🎉", "model": REPO_ID}
 
 @app.get("/health")
 async def health():
@@ -220,6 +217,6 @@ async def generate(req: GenerationRequest, request: Request):
 
     return GenerationResponse(
         generated_text=generated_text,
-        model=f"{REPO_ID}/{MODEL_FILENAME}",
+        model=REPO_ID,
         runtime_seconds=round(runtime, 3)
     )
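
Once the Space is running, the updated endpoints can be exercised as in the sketch below. This is a hedged usage example: the payload field names are inferred from generate_async()'s signature rather than from the GenerationRequest model (which is not shown in the diff), and port 7860 is assumed from the usual Hugging Face Spaces default.

# Hedged usage sketch for the updated API (payload field names and port are assumptions).
import requests

BASE_URL = "http://localhost:7860"  # assumption: default Spaces port

# Endpoints shown in the diff
print(requests.get(f"{BASE_URL}/health").json())
print(requests.get(f"{BASE_URL}/").json())  # now reports "model": REPO_ID only

payload = {
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_new_tokens": 64,   # forwarded as max_new_tokens in generation_kwargs
    "temperature": 0.7,
    "top_p": 0.9,
    "do_sample": True,      # now forwarded to the pipeline
}
resp = requests.post(f"{BASE_URL}/generate", json=payload, timeout=120)
print(resp.json())  # expected keys: generated_text, model, runtime_seconds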
requirements.txt CHANGED
@@ -1,6 +1,7 @@
 fastapi
 uvicorn
-llama-cpp-python
+transformers
+torch
 pydantic
 python-multipart
 starlette
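
A quick environment check for the new dependency stack, as a sketch: the "0" default for N_GPU_LAYERS is an assumption, since its definition falls outside the hunks shown above.

# Environment check sketch for the transformers/torch stack (not part of the commit).
import os

import torch
import transformers

print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

# app.py keeps the llama-cpp-era N_GPU_LAYERS env var to pick the device;
# the "0" default here is an assumption (its definition is not in the diff).
n_gpu_layers = int(os.getenv("N_GPU_LAYERS", "0"))
print("pipeline device_map would be:", "auto" if n_gpu_layers > 0 else "cpu")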