triflix committed · verified
Commit d786f9d · 1 Parent(s): 6d00460

Create script.sh

Files changed (1):
  1. script.sh +244 -0
script.sh ADDED
@@ -0,0 +1,244 @@
+#!/bin/bash
+set -e
+
+echo "=========================================="
+echo "Phi-4-mini-instruct Agentic Model Server"
+echo "=========================================="
+echo ""
+
+# Configuration (exported so the embedded Python blocks below can read it)
+export MODEL_REPO="${MODEL_REPO:-unsloth/Phi-4-mini-instruct-GGUF}"
+export MODEL_FILE="${MODEL_FILE:-Phi-4-mini-instruct-Q4_K_M.gguf}"
+export N_CTX="${N_CTX:-8192}"
+export N_THREADS="${N_THREADS:-2}"
+export HOST="${HOST:-0.0.0.0}"
+export PORT="${PORT:-7860}"
+export MODEL_DIR="${MODEL_DIR:-/app/models}"
+
+echo "Configuration:"
+echo "  Model Repo:     $MODEL_REPO"
+echo "  Model File:     $MODEL_FILE"
+echo "  Context Length: $N_CTX"
+echo "  Threads:        $N_THREADS"
+echo "  Host:           $HOST"
+echo "  Port:           $PORT"
+echo ""
+
+# Create model directory
+mkdir -p "$MODEL_DIR"
+
+# Download the model on first run; later runs reuse the cached file
+MODEL_PATH="$MODEL_DIR/$MODEL_FILE"
+if [ ! -f "$MODEL_PATH" ]; then
+    echo "Downloading model from HuggingFace..."
+    echo "This may take a few minutes on first run..."
+    python3 << 'PYTHON_DOWNLOAD'
+import os
+from huggingface_hub import hf_hub_download
+
+repo = os.environ.get("MODEL_REPO", "unsloth/Phi-4-mini-instruct-GGUF")
+filename = os.environ.get("MODEL_FILE", "Phi-4-mini-instruct-Q4_K_M.gguf")
+model_dir = os.environ.get("MODEL_DIR", "/app/models")
+
+print(f"Downloading {filename} from {repo}...")
+model_path = hf_hub_download(
+    repo_id=repo,
+    filename=filename,
+    local_dir=model_dir,
+    local_dir_use_symlinks=False
+)
+print(f"Model saved to: {model_path}")
+PYTHON_DOWNLOAD
+    echo "Model download complete!"
+else
+    echo "Model already cached at: $MODEL_PATH"
+fi
+
+echo ""
+echo "Starting Phi-4-mini-instruct server on port $PORT..."
+echo "OpenAI-compatible API endpoints available:"
+echo "  - POST /v1/chat/completions"
+echo "  - POST /v1/completions"
+echo "  - GET  /v1/models"
+echo "  - GET  /health"
+echo ""
+echo "Tool calling example:"
+echo '  curl -X POST http://localhost:7860/v1/chat/completions \'
+echo '    -H "Content-Type: application/json" \'
+echo '    -d '"'"'{"model": "phi-4-mini", "messages": [...], "tools": [...]}'"'"''
+echo ""
+
+# Start the FastAPI server
+python3 << 'PYTHON_SERVER'
+import os
+import json
+import time
+from typing import List, Optional, Dict, Any, Union
+from contextlib import asynccontextmanager
+
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse
+from pydantic import BaseModel, Field
+import uvicorn
+from llama_cpp import Llama
+
+# Configuration
+MODEL_PATH = os.environ.get("MODEL_DIR", "/app/models") + "/" + os.environ.get("MODEL_FILE", "Phi-4-mini-instruct-Q4_K_M.gguf")
+N_CTX = int(os.environ.get("N_CTX", "8192"))
+N_THREADS = int(os.environ.get("N_THREADS", "2"))
+
+# Global model instance
+llm = None
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global llm
+    print(f"Loading model from: {MODEL_PATH}")
+    print(f"Context length: {N_CTX}, Threads: {N_THREADS}")
+
+    llm = Llama(
+        model_path=MODEL_PATH,
+        n_ctx=N_CTX,
+        n_threads=N_THREADS,
+        verbose=False
+    )
+    print("Model loaded successfully!")
+    yield
+    print("Shutting down...")
+
+app = FastAPI(
+    title="Phi-4-mini-instruct API",
+    description="OpenAI-compatible API for Phi-4-mini-instruct with tool calling support",
+    version="1.0.0",
+    lifespan=lifespan
+)
+
+# Pydantic models for OpenAI compatibility
+class ChatMessage(BaseModel):
+    role: str
+    content: Optional[str] = None
+    name: Optional[str] = None
+    tool_calls: Optional[List[Dict]] = None
+    tool_call_id: Optional[str] = None
+
+class ToolFunction(BaseModel):
+    name: str
+    description: Optional[str] = ""
+    parameters: Optional[Dict] = {}
+
+class Tool(BaseModel):
+    type: str = "function"
+    function: ToolFunction
+
+class ChatCompletionRequest(BaseModel):
+    model: str = "phi-4-mini"
+    messages: List[ChatMessage]
+    tools: Optional[List[Tool]] = None
+    tool_choice: Optional[Union[str, Dict]] = "auto"
+    temperature: Optional[float] = 0.7
+    max_tokens: Optional[int] = 2048
+    stream: Optional[bool] = False
+
+class ModelInfo(BaseModel):
+    id: str
+    object: str = "model"
+    created: int
+    owned_by: str = "microsoft"
+
+@app.get("/health")
+async def health_check():
+    return {"status": "healthy", "model_loaded": llm is not None}
+
+@app.get("/v1/models")
+async def list_models():
+    return {
+        "object": "list",
+        "data": [
+            {
+                "id": "phi-4-mini",
+                "object": "model",
+                "created": int(time.time()),
+                "owned_by": "microsoft"
+            }
+        ]
+    }
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: ChatCompletionRequest):
+    if llm is None:
+        raise HTTPException(status_code=503, detail="Model not loaded")
+
+    # Convert messages to llama.cpp format
+    messages = []
+    for msg in request.messages:
+        message = {"role": msg.role, "content": msg.content or ""}
+        if msg.tool_calls:
+            message["tool_calls"] = msg.tool_calls
+        if msg.tool_call_id:
+            message["tool_call_id"] = msg.tool_call_id
+        messages.append(message)
+
+    # Prepare tools if provided
+    tools = None
+    if request.tools:
+        tools = [t.model_dump() for t in request.tools]
+
+    try:
+        response = llm.create_chat_completion(
+            messages=messages,
+            tools=tools,
+            tool_choice=request.tool_choice if tools else None,
+            temperature=request.temperature,
+            max_tokens=request.max_tokens,
+            stream=request.stream
+        )
+
+        if request.stream:
+            async def generate():
+                for chunk in response:
+                    yield f"data: {json.dumps(chunk)}\n\n"
+                yield "data: [DONE]\n\n"
+            return StreamingResponse(generate(), media_type="text/event-stream")
+
+        return JSONResponse(content=response)
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/v1/completions")
+async def completions(request: dict):
+    if llm is None:
+        raise HTTPException(status_code=503, detail="Model not loaded")
+
+    prompt = request.get("prompt", "")
+    max_tokens = request.get("max_tokens", 2048)
+    temperature = request.get("temperature", 0.7)
+    stream = request.get("stream", False)
+
+    try:
+        response = llm(
+            prompt=prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            stream=stream
+        )
+
+        if stream:
+            async def generate():
+                for chunk in response:
+                    yield f"data: {json.dumps(chunk)}\n\n"
+                yield "data: [DONE]\n\n"
+            return StreamingResponse(generate(), media_type="text/event-stream")
+
+        return JSONResponse(content=response)
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+if __name__ == "__main__":
+    uvicorn.run(
+        app,
+        host=os.environ.get("HOST", "0.0.0.0"),
+        port=int(os.environ.get("PORT", 7860))
+    )
+PYTHON_SERVER
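
For reference, a minimal client sketch for exercising the tool-calling endpoint once the server is up. It assumes the server is reachable at http://localhost:7860 and that the requests package is installed; the get_weather tool is a hypothetical example, not something defined by the script:

import requests

# Hypothetical tool schema for illustration; any OpenAI-style tool definition works.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"]
        }
    }
}]

payload = {
    "model": "phi-4-mini",
    "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
    "tools": tools,
    "tool_choice": "auto"
}

resp = requests.post("http://localhost:7860/v1/chat/completions", json=payload, timeout=300)
resp.raise_for_status()
message = resp.json()["choices"][0]["message"]

# When the model decides to call a tool, the arguments arrive as a JSON string.
for call in message.get("tool_calls") or []:
    print(call["function"]["name"], call["function"]["arguments"])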
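
And a sketch for consuming the server-sent-event stream that the stream: true path emits (same assumptions as above; chunks are expected to follow the OpenAI streaming schema that llama-cpp-python produces):

import json
import requests

payload = {
    "model": "phi-4-mini",
    "messages": [{"role": "user", "content": "Write a haiku about GPUs."}],
    "stream": True
}

# stream=True keeps the HTTP connection open so "data: ..." lines arrive as generated.
with requests.post("http://localhost:7860/v1/chat/completions",
                   json=payload, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        data = line[len(b"data: "):]
        if data == b"[DONE]":
            break
        chunk = json.loads(data)
        # Each streamed chunk carries incremental text under choices[0].delta.
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)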