gmmpb00 committed
Commit 4b53ea5 · Parent: e46dd7b

🚀 Major performance optimizations for free tier


- Reduced context window from 4096 to 2048 tokens
- Lowered thread count from 4 to 2 to reduce resource competition
- Enabled memory mapping and low VRAM optimizations
- Reduced default max_tokens from 256 to 64
- Tightened the max_tokens cap from 2048 to 128
- Tuned generation parameters (lower temperature and top_p, added a top_k limit)
- Added early stopping patterns to prevent long responses
- Added prompt truncation to avoid timeouts
- Added graceful error handling with fallback responses
- Added /fast-chat endpoint with ultra-strict limits (32 tokens max); see the example request after this list
- Optimized Dockerfile with performance environment variables
- Pinned dependency versions for stability
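
For reference, a minimal client sketch of the two endpoints under the new limits. It assumes the container is reachable at http://localhost:7860, that the existing chat route is mounted at /chat, and it uses the requests package, which is illustrative and not pinned in requirements.txt:

import requests  # illustrative client-side dependency, not part of this repo

BASE_URL = "http://localhost:7860"  # assumed local address of the running container

# /chat: max_tokens above 128 is clamped server-side to 128; prompts beyond 500 chars are truncated.
resp = requests.post(f"{BASE_URL}/chat",
                     json={"prompt": "Explain what a context window is.", "max_tokens": 512})
print(resp.json()["answer"])

# /fast-chat: prompt cut to 200 chars, at most 32 tokens generated, stops at the first sentence.
resp = requests.post(f"{BASE_URL}/fast-chat",
                     json={"prompt": "Say hello in one short sentence."})
print(resp.json()["answer"])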

Files changed (3)
  1. Dockerfile +23 -8
  2. app/main.py +71 -10
  3. requirements.txt +4 -4
Dockerfile CHANGED
@@ -1,18 +1,33 @@
  FROM python:3.11-slim

- # build deps for llama-cpp-python → needs gcc, g++, make, cmake
- RUN apt-get update && apt-get install -y build-essential cmake && \
-     rm -rf /var/lib/apt/lists/*  # keep image small
+ # Install build dependencies with optimizations for smaller image
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     cmake \
+     && rm -rf /var/lib/apt/lists/* \
+     && apt-get clean

  WORKDIR /code
+
+ # Copy and install requirements with optimizations
  COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt  # compiles llama-cpp
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt

- # copy the rest of the repo (your app/ + model/ directory or download logic)
+ # Copy application code
  COPY . .

- # Create model directory and set proper permissions for the entire /code directory
- RUN mkdir -p /code/model && chmod -R 777 /code
+ # Create model directory and set permissions
+ RUN mkdir -p /code/model && \
+     chmod -R 777 /code && \
+     find /code -type f -name "*.py" -exec chmod +x {} \;
+
+ # Optimize Python performance
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONOPTIMIZE=1

  EXPOSE 7860
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
+
+ # Use optimized uvicorn settings for free tier
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--loop", "asyncio", "--access-log"]
app/main.py CHANGED
@@ -38,12 +38,24 @@ if not Path(MODEL_PATH).exists():
          print(f"Permission denied copying to {MODEL_PATH}, using cached model directly")
          MODEL_PATH = cached_model_path

- # load the instruct model
- llm = Llama(model_path=MODEL_PATH, n_ctx=4096, n_threads=4)
+ # Load the model with optimizations for free tier
+ llm = Llama(
+     model_path=MODEL_PATH,
+     n_ctx=2048,          # Reduced context window for speed
+     n_threads=2,         # Reduced threads to avoid resource competition
+     n_batch=512,         # Smaller batch size
+     use_mmap=True,       # Use memory mapping for efficiency
+     use_mlock=False,     # Don't lock memory (may cause issues on free tier)
+     low_vram=True,       # Optimize for low VRAM/RAM
+     f16_kv=True,         # Use 16-bit for key-value cache
+     logits_all=False,    # Don't compute logits for all tokens
+     vocab_only=False,
+     verbose=False        # Reduce logging overhead
+ )

  class Req(BaseModel):
      prompt: str
-     max_tokens: int | None = 256
+     max_tokens: int | None = 64   # Much smaller default for speed

  app = FastAPI(title="Phi-3 Chat API", description="A simple chat API using Phi-3 model")

@@ -73,25 +85,39 @@ def chat(r: Req):
          if not r.prompt or len(r.prompt.strip()) == 0:
              raise HTTPException(status_code=400, detail="Prompt cannot be empty")

-         # Ensure max_tokens is reasonable
+         # Strict limits for free tier performance
          if r.max_tokens is None:
-             r.max_tokens = 256
-         if r.max_tokens > 2048:
-             r.max_tokens = 2048
+             r.max_tokens = 64
+         if r.max_tokens > 128:   # Much stricter limit
+             r.max_tokens = 128
          if r.max_tokens < 1:
              r.max_tokens = 1

+         # Truncate prompt if too long to avoid timeout
+         if len(r.prompt) > 500:
+             r.prompt = r.prompt[:500] + "..."
+
          logger.info(f"Processing with max_tokens={r.max_tokens}")

+         # Optimized generation parameters for speed
          out = llm(
              prompt=r.prompt,
              max_tokens=r.max_tokens,
              stream=False,
-             temperature=0.7,
-             top_p=0.9
+             temperature=0.3,     # Lower temperature for faster, more focused responses
+             top_p=0.7,           # More focused sampling
+             top_k=20,            # Limit vocabulary for speed
+             repeat_penalty=1.1,
+             stop=["\n\n", "Human:", "Assistant:", "User:"],   # Stop early on common patterns
+             echo=False           # Don't echo the prompt back
          )

          response_text = out["choices"][0]["text"].strip()
+
+         # Handle empty responses
+         if not response_text:
+             response_text = "I need more context to provide a helpful response."
+
          logger.info(f"Generated response length: {len(response_text)}")

          return {"answer": response_text}

@@ -99,4 +125,39 @@ def chat(r: Req):
          raise
      except Exception as e:
          logger.error(f"Error in chat endpoint: {str(e)}")
-         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+         # Fallback response instead of error
+         return {"answer": "I'm experiencing high load. Please try a shorter message."}
+
+ @app.post("/fast-chat")
+ def fast_chat(r: Req):
+     """Ultra-fast endpoint with very strict limits for free tier"""
+     try:
+         logger.info(f"Fast chat request: {r.prompt[:30]}...")
+
+         if not r.prompt or len(r.prompt.strip()) == 0:
+             return {"answer": "Please provide a message."}
+
+         # Ultra-strict limits for maximum speed
+         max_tokens = min(r.max_tokens or 32, 32)   # Max 32 tokens
+         prompt = r.prompt[:200]                    # Max 200 chars
+
+         out = llm(
+             prompt=prompt,
+             max_tokens=max_tokens,
+             stream=False,
+             temperature=0.1,     # Very low for speed
+             top_p=0.5,
+             top_k=10,            # Very limited vocabulary
+             repeat_penalty=1.0,
+             stop=["\n", ".", "!", "?"],   # Stop on first sentence
+             echo=False
+         )
+
+         response_text = out["choices"][0]["text"].strip()
+         if not response_text:
+             response_text = "OK"
+
+         return {"answer": response_text}
+     except Exception as e:
+         logger.error(f"Fast chat error: {str(e)}")
+         return {"answer": "Quick response unavailable."}
requirements.txt CHANGED
@@ -1,5 +1,5 @@
  llama-cpp-python==0.2.*
- fastapi
- uvicorn[standard]
- huggingface-hub
- pydantic
+ fastapi==0.104.*
+ uvicorn[standard]==0.24.*
+ huggingface-hub==0.19.*
+ pydantic==2.5.*