Major performance optimizations for free tier

- Reduced context window from 4096 to 2048 tokens
- Lowered thread count from 4 to 2 to reduce resource competition
- Enabled memory mapping and low VRAM optimizations
- Reduced default max_tokens from 256 to 64
- Added strict token limits (max 128, down from 2048)
- Tuned generation parameters (lower temperature, top_p, and top_k)
- Added early stopping patterns to prevent long responses
- Added prompt truncation to avoid timeouts
- Added graceful error handling with fallback responses
- Added /fast-chat endpoint with ultra-strict limits (32 tokens max)
- Optimized Dockerfile with performance environment variables
- Pinned dependency versions for stability
- Dockerfile +23 -8
- app/main.py +71 -10
- requirements.txt +4 -4
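
For a quick smoke test of the new limits, here is a minimal stdlib-only client sketch. It is illustrative and not part of this commit; the localhost URL assumes the container is running locally on the port exposed by the Dockerfile below.

```python
import json
import urllib.request

def post(path: str, payload: dict) -> dict:
    # Assumes the app is reachable locally on the default exposed port 7860.
    req = urllib.request.Request(
        f"http://localhost:7860{path}",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

# /chat clamps max_tokens to at most 128; /fast-chat caps it at 32 regardless.
print(post("/chat", {"prompt": "What is Phi-3?", "max_tokens": 64}))
print(post("/fast-chat", {"prompt": "Say hello"}))
```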
Dockerfile
CHANGED

```diff
@@ -1,18 +1,33 @@
 FROM python:3.11-slim
 
-# build
-RUN apt-get update && apt-get install -y
+# Install build dependencies with optimizations for smaller image
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    cmake \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
 
 WORKDIR /code
+
+# Copy and install requirements with optimizations
 COPY requirements.txt .
-RUN pip install --no-cache-dir
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
 
-#
+# Copy application code
 COPY . .
 
-# Create model directory and set
-RUN mkdir -p /code/model &&
+# Create model directory and set permissions
+RUN mkdir -p /code/model && \
+    chmod -R 777 /code && \
+    find /code -type f -name "*.py" -exec chmod +x {} \;
+
+# Optimize Python performance
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONOPTIMIZE=1
 
 EXPOSE 7860
+
+# Use optimized uvicorn settings for free tier
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--loop", "asyncio", "--access-log"]
```
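
One side effect worth noting: `PYTHONOPTIMIZE=1` is equivalent to running `python -O`, which sets `__debug__` to `False` and strips `assert` statements at compile time, so any asserts in the app are silently skipped inside this image. A tiny illustrative check (not part of the commit):

```python
import os

# Under PYTHONOPTIMIZE=1 (same as `python -O`), __debug__ is False and
# `assert` statements are compiled away entirely.
if not __debug__:
    print("optimized mode: asserts are disabled")
print(os.environ.get("PYTHONOPTIMIZE"))  # prints "1" inside this image
```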
app/main.py
CHANGED

```diff
@@ -38,12 +38,24 @@ if not Path(MODEL_PATH).exists():
         print(f"Permission denied copying to {MODEL_PATH}, using cached model directly")
         MODEL_PATH = cached_model_path
 
-#
-llm = Llama(
+# Load the model with optimizations for free tier
+llm = Llama(
+    model_path=MODEL_PATH,
+    n_ctx=2048,        # Reduced context window for speed
+    n_threads=2,       # Reduced threads to avoid resource competition
+    n_batch=512,       # Smaller batch size
+    use_mmap=True,     # Use memory mapping for efficiency
+    use_mlock=False,   # Don't lock memory (may cause issues on free tier)
+    low_vram=True,     # Optimize for low VRAM/RAM
+    f16_kv=True,       # Use 16-bit for key-value cache
+    logits_all=False,  # Don't compute logits for all tokens
+    vocab_only=False,
+    verbose=False      # Reduce logging overhead
+)
 
 class Req(BaseModel):
     prompt: str
-    max_tokens: int | None =
+    max_tokens: int | None = 64  # Much smaller default for speed
 
 app = FastAPI(title="Phi-3 Chat API", description="A simple chat API using Phi-3 model")
 
@@ -73,25 +85,39 @@ def chat(r: Req):
         if not r.prompt or len(r.prompt.strip()) == 0:
             raise HTTPException(status_code=400, detail="Prompt cannot be empty")
 
-        #
+        # Strict limits for free tier performance
         if r.max_tokens is None:
-            r.max_tokens =
-        if r.max_tokens >
-            r.max_tokens =
+            r.max_tokens = 64
+        if r.max_tokens > 128:  # Much stricter limit
+            r.max_tokens = 128
         if r.max_tokens < 1:
             r.max_tokens = 1
 
+        # Truncate prompt if too long to avoid timeout
+        if len(r.prompt) > 500:
+            r.prompt = r.prompt[:500] + "..."
+
         logger.info(f"Processing with max_tokens={r.max_tokens}")
 
+        # Optimized generation parameters for speed
         out = llm(
             prompt=r.prompt,
             max_tokens=r.max_tokens,
             stream=False,
-            temperature=0.
-            top_p=0.
+            temperature=0.3,  # Lower temperature for faster, more focused responses
+            top_p=0.7,        # More focused sampling
+            top_k=20,         # Limit vocabulary for speed
+            repeat_penalty=1.1,
+            stop=["\n\n", "Human:", "Assistant:", "User:"],  # Stop early on common patterns
+            echo=False        # Don't echo the prompt back
         )
 
         response_text = out["choices"][0]["text"].strip()
+
+        # Handle empty responses
+        if not response_text:
+            response_text = "I need more context to provide a helpful response."
+
         logger.info(f"Generated response length: {len(response_text)}")
 
         return {"answer": response_text}
@@ -99,4 +125,39 @@ def chat(r: Req):
         raise
     except Exception as e:
         logger.error(f"Error in chat endpoint: {str(e)}")
+        # Fallback response instead of error
+        return {"answer": "I'm experiencing high load. Please try a shorter message."}
+
+@app.post("/fast-chat")
+def fast_chat(r: Req):
+    """Ultra-fast endpoint with very strict limits for free tier"""
+    try:
+        logger.info(f"Fast chat request: {r.prompt[:30]}...")
+
+        if not r.prompt or len(r.prompt.strip()) == 0:
+            return {"answer": "Please provide a message."}
+
+        # Ultra-strict limits for maximum speed
+        max_tokens = min(r.max_tokens or 32, 32)  # Max 32 tokens
+        prompt = r.prompt[:200]  # Max 200 chars
+
+        out = llm(
+            prompt=prompt,
+            max_tokens=max_tokens,
+            stream=False,
+            temperature=0.1,  # Very low for speed
+            top_p=0.5,
+            top_k=10,         # Very limited vocabulary
+            repeat_penalty=1.0,
+            stop=["\n", ".", "!", "?"],  # Stop on first sentence
+            echo=False
+        )
+
+        response_text = out["choices"][0]["text"].strip()
+        if not response_text:
+            response_text = "OK"
+
+        return {"answer": response_text}
+    except Exception as e:
+        logger.error(f"Fast chat error: {str(e)}")
+        return {"answer": "Quick response unavailable."}
```
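
Taken together, the clamping rules in the two endpoints reduce to a small pure function. This restatement is illustrative only (the function name is hypothetical, not part of the app):

```python
def effective_max_tokens(requested: int | None, fast: bool = False) -> int:
    """Mirror the max_tokens clamping in /chat and /fast-chat."""
    if fast:
        # /fast-chat: default 32, hard cap 32 (a falsy 0 also falls back to 32)
        return min(requested or 32, 32)
    value = 64 if requested is None else requested  # /chat default
    return max(1, min(value, 128))                  # clamp to [1, 128]

assert effective_max_tokens(None) == 64
assert effective_max_tokens(2048) == 128
assert effective_max_tokens(500, fast=True) == 32
```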
requirements.txt
CHANGED

```diff
@@ -1,5 +1,5 @@
 llama-cpp-python==0.2.*
-fastapi
-uvicorn[standard]
-huggingface-hub
-pydantic
+fastapi==0.104.*
+uvicorn[standard]==0.24.*
+huggingface-hub==0.19.*
+pydantic==2.5.*
```
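
For clarity on what these wildcard pins accept: `==0.104.*` matches any 0.104.x release and nothing newer. A quick check using the `packaging` library (an assumed dev-only helper, not added to requirements.txt):

```python
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("==0.104.*")
print("0.104.1" in spec)  # True: any 0.104.x patch release satisfies the pin
print("0.105.0" in spec)  # False: minor version bumps are excluded
```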