gmmpb00 committed
Commit 4b53ea5 · Parent: e46dd7b

🚀 Major performance optimizations for free tier


- Reduced context window from 4096 to 2048 tokens
- Lowered thread count from 4 to 2 to reduce resource competition
- Enabled memory mapping and low VRAM optimizations
- Reduced default max_tokens from 256 to 64
- Tightened the max_tokens cap from 2048 to 128
- Tuned generation parameters (lower temperature and top_p, added a top_k limit)
- Added early stopping patterns to prevent long responses
- Added prompt truncation to avoid timeouts
- Added graceful error handling with fallback responses
- Added /fast-chat endpoint with ultra-strict limits (32 tokens max); see the example request after this list
- Optimized Dockerfile with performance environment variables
- Pinned dependency versions for stability
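
For reference, a minimal client sketch of the two endpoints under the new limits. It assumes the container is reachable at http://localhost:7860, that the existing chat route is mounted at /chat, and it uses the requests package, which is illustrative and not pinned in requirements.txt:

import requests  # illustrative client-side dependency, not part of this repo

BASE_URL = "http://localhost:7860"  # assumed local address of the running container

# /chat: max_tokens above 128 is clamped server-side to 128; prompts beyond 500 chars are truncated.
resp = requests.post(f"{BASE_URL}/chat",
                     json={"prompt": "Explain what a context window is.", "max_tokens": 512})
print(resp.json()["answer"])

# /fast-chat: prompt cut to 200 chars, at most 32 tokens generated, stops at the first sentence.
resp = requests.post(f"{BASE_URL}/fast-chat",
                     json={"prompt": "Say hello in one short sentence."})
print(resp.json()["answer"])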

Files changed (3)
  1. Dockerfile +23 -8
  2. app/main.py +71 -10
  3. requirements.txt +4 -4
Dockerfile CHANGED
@@ -1,18 +1,33 @@
  FROM python:3.11-slim

- # build deps for llama-cpp-python → needs gcc, g++, make, cmake
- RUN apt-get update && apt-get install -y build-essential cmake && \
-     rm -rf /var/lib/apt/lists/*  # keep image small
+ # Install build dependencies with optimizations for smaller image
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     cmake \
+     && rm -rf /var/lib/apt/lists/* \
+     && apt-get clean

  WORKDIR /code
+
+ # Copy and install requirements with optimizations
  COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt  # compiles llama-cpp
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt

- # copy the rest of the repo (your app/ + model/ directory or download logic)
+ # Copy application code
  COPY . .

- # Create model directory and set proper permissions for the entire /code directory
- RUN mkdir -p /code/model && chmod -R 777 /code
+ # Create model directory and set permissions
+ RUN mkdir -p /code/model && \
+     chmod -R 777 /code && \
+     find /code -type f -name "*.py" -exec chmod +x {} \;
+
+ # Optimize Python performance
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONOPTIMIZE=1

  EXPOSE 7860
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
+
+ # Use optimized uvicorn settings for free tier
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--loop", "asyncio", "--access-log"]
app/main.py CHANGED
@@ -38,12 +38,24 @@ if not Path(MODEL_PATH).exists():
          print(f"Permission denied copying to {MODEL_PATH}, using cached model directly")
          MODEL_PATH = cached_model_path

- # load the instruct model
- llm = Llama(model_path=MODEL_PATH, n_ctx=4096, n_threads=4)
+ # Load the model with optimizations for free tier
+ llm = Llama(
+     model_path=MODEL_PATH,
+     n_ctx=2048,          # Reduced context window for speed
+     n_threads=2,         # Reduced threads to avoid resource competition
+     n_batch=512,         # Smaller batch size
+     use_mmap=True,       # Use memory mapping for efficiency
+     use_mlock=False,     # Don't lock memory (may cause issues on free tier)
+     low_vram=True,       # Optimize for low VRAM/RAM
+     f16_kv=True,         # Use 16-bit for key-value cache
+     logits_all=False,    # Don't compute logits for all tokens
+     vocab_only=False,
+     verbose=False        # Reduce logging overhead
+ )

  class Req(BaseModel):
      prompt: str
-     max_tokens: int | None = 256
+     max_tokens: int | None = 64   # Much smaller default for speed

  app = FastAPI(title="Phi-3 Chat API", description="A simple chat API using Phi-3 model")

@@ -73,25 +85,39 @@ def chat(r: Req):
          if not r.prompt or len(r.prompt.strip()) == 0:
              raise HTTPException(status_code=400, detail="Prompt cannot be empty")

-         # Ensure max_tokens is reasonable
+         # Strict limits for free tier performance
          if r.max_tokens is None:
-             r.max_tokens = 256
-         if r.max_tokens > 2048:
-             r.max_tokens = 2048
+             r.max_tokens = 64
+         if r.max_tokens > 128:   # Much stricter limit
+             r.max_tokens = 128
          if r.max_tokens < 1:
              r.max_tokens = 1

+         # Truncate prompt if too long to avoid timeout
+         if len(r.prompt) > 500:
+             r.prompt = r.prompt[:500] + "..."
+
          logger.info(f"Processing with max_tokens={r.max_tokens}")

+         # Optimized generation parameters for speed
          out = llm(
              prompt=r.prompt,
              max_tokens=r.max_tokens,
              stream=False,
-             temperature=0.7,
-             top_p=0.9
+             temperature=0.3,     # Lower temperature for faster, more focused responses
+             top_p=0.7,           # More focused sampling
+             top_k=20,            # Limit vocabulary for speed
+             repeat_penalty=1.1,
+             stop=["\n\n", "Human:", "Assistant:", "User:"],   # Stop early on common patterns
+             echo=False           # Don't echo the prompt back
          )

          response_text = out["choices"][0]["text"].strip()
+
+         # Handle empty responses
+         if not response_text:
+             response_text = "I need more context to provide a helpful response."
+
          logger.info(f"Generated response length: {len(response_text)}")

          return {"answer": response_text}

@@ -99,4 +125,39 @@ def chat(r: Req):
          raise
      except Exception as e:
          logger.error(f"Error in chat endpoint: {str(e)}")
-         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+         # Fallback response instead of error
+         return {"answer": "I'm experiencing high load. Please try a shorter message."}
+
+ @app.post("/fast-chat")
+ def fast_chat(r: Req):
+     """Ultra-fast endpoint with very strict limits for free tier"""
+     try:
+         logger.info(f"Fast chat request: {r.prompt[:30]}...")
+
+         if not r.prompt or len(r.prompt.strip()) == 0:
+             return {"answer": "Please provide a message."}
+
+         # Ultra-strict limits for maximum speed
+         max_tokens = min(r.max_tokens or 32, 32)   # Max 32 tokens
+         prompt = r.prompt[:200]                    # Max 200 chars
+
+         out = llm(
+             prompt=prompt,
+             max_tokens=max_tokens,
+             stream=False,
+             temperature=0.1,     # Very low for speed
+             top_p=0.5,
+             top_k=10,            # Very limited vocabulary
+             repeat_penalty=1.0,
+             stop=["\n", ".", "!", "?"],   # Stop on first sentence
+             echo=False
+         )
+
+         response_text = out["choices"][0]["text"].strip()
+         if not response_text:
+             response_text = "OK"
+
+         return {"answer": response_text}
+     except Exception as e:
+         logger.error(f"Fast chat error: {str(e)}")
+         return {"answer": "Quick response unavailable."}
requirements.txt CHANGED
@@ -1,5 +1,5 @@
  llama-cpp-python==0.2.*
- fastapi
- uvicorn[standard]
- huggingface-hub
- pydantic
+ fastapi==0.104.*
+ uvicorn[standard]==0.24.*
+ huggingface-hub==0.19.*
+ pydantic==2.5.*