#!/usr/bin/env bash
# modernLLM / script.sh
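#
# Downloads a quantized Phi-4-mini-instruct GGUF from Hugging Face on first run,
# caches it under $MODEL_DIR, then serves it through an OpenAI-compatible
# FastAPI wrapper around llama-cpp-python. Assumes the runtime already has
# python3 with huggingface_hub, fastapi, uvicorn, pydantic, and llama-cpp-python
# installed.
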
set -euo pipefail
echo "=========================================="
echo "Phi-4-mini-instruct Agentic Model Server"
echo "=========================================="
echo ""
# Configuration
MODEL_REPO="${MODEL_REPO:-unsloth/Phi-4-mini-instruct-GGUF}"
MODEL_FILE="${MODEL_FILE:-Phi-4-mini-instruct-Q4_K_M.gguf}"
N_CTX="${N_CTX:-8192}"
N_THREADS="${N_THREADS:-2}"
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-7860}"
MODEL_DIR="${MODEL_DIR:-/app/models}"

# Export everything so the quoted heredocs below (which get no shell expansion)
# can read these settings from the environment.
export MODEL_REPO MODEL_FILE N_CTX N_THREADS HOST PORT MODEL_DIR
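
# Any setting can be overridden from the environment before launch, e.g.
# (the filename below is illustrative; use any GGUF quant present in the repo):
#   MODEL_FILE=Phi-4-mini-instruct-Q8_0.gguf N_THREADS=4 ./script.sh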
echo "Configuration:"
echo " Model Repo: $MODEL_REPO"
echo " Model File: $MODEL_FILE"
echo " Context Length: $N_CTX"
echo " Threads: $N_THREADS"
echo " Host: $HOST"
echo " Port: $PORT"
echo ""
# Create model directory
mkdir -p "$MODEL_DIR"
# Download the model on first run; skip if it is already cached
MODEL_PATH="$MODEL_DIR/$MODEL_FILE"
if [ ! -f "$MODEL_PATH" ]; then
    echo "Downloading model from HuggingFace..."
    echo "This may take a few minutes on first run..."
    python3 << 'PYTHON_DOWNLOAD'
import os

from huggingface_hub import hf_hub_download

repo = os.environ.get("MODEL_REPO", "unsloth/Phi-4-mini-instruct-GGUF")
filename = os.environ.get("MODEL_FILE", "Phi-4-mini-instruct-Q4_K_M.gguf")
model_dir = os.environ.get("MODEL_DIR", "/app/models")

print(f"Downloading {filename} from {repo}...")
# local_dir_use_symlinks is deprecated in recent huggingface_hub releases;
# local_dir alone places the real file under model_dir.
model_path = hf_hub_download(
    repo_id=repo,
    filename=filename,
    local_dir=model_dir,
)
print(f"Model saved to: {model_path}")
PYTHON_DOWNLOAD
    echo "Model download complete!"
else
    echo "Model already cached at: $MODEL_PATH"
fi
echo ""
echo "Starting Phi-4-mini-instruct server on port $PORT..."
echo "OpenAI-compatible API endpoints available:"
echo " - POST /v1/chat/completions"
echo " - POST /v1/completions"
echo " - GET /v1/models"
echo " - GET /health"
echo ""
echo "Tool calling example:"
echo '  curl -X POST http://localhost:7860/v1/chat/completions \'
echo '    -H "Content-Type: application/json" \'
echo '    -d '"'"'{"model": "phi-4-mini", "messages": [...], "tools": [...]}'"'"''
echo ""
# Start the FastAPI server
python3 << 'PYTHON_SERVER'
import os
import json
import time
from typing import List, Optional, Dict, Any, Union
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel, Field
import uvicorn
from llama_cpp import Llama
# Configuration (read from the environment exported by the shell section above)
MODEL_PATH = os.path.join(
    os.environ.get("MODEL_DIR", "/app/models"),
    os.environ.get("MODEL_FILE", "Phi-4-mini-instruct-Q4_K_M.gguf"),
)
N_CTX = int(os.environ.get("N_CTX", "8192"))
N_THREADS = int(os.environ.get("N_THREADS", "2"))

# Global model instance, loaded once at startup by the lifespan handler
llm = None
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Load the GGUF model once at startup; anything after yield runs on shutdown
    global llm
    print(f"Loading model from: {MODEL_PATH}")
    print(f"Context length: {N_CTX}, Threads: {N_THREADS}")
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=N_CTX,
        n_threads=N_THREADS,
        verbose=False
    )
    print("Model loaded successfully!")
    yield
    print("Shutting down...")

app = FastAPI(
    title="Phi-4-mini-instruct API",
    description="OpenAI-compatible API for Phi-4-mini-instruct with tool calling support",
    version="1.0.0",
    lifespan=lifespan
)
# Pydantic models mirroring the OpenAI chat-completions request schema
class ChatMessage(BaseModel):
    role: str
    content: Optional[str] = None
    name: Optional[str] = None
    tool_calls: Optional[List[Dict]] = None
    tool_call_id: Optional[str] = None

class ToolFunction(BaseModel):
    name: str
    description: Optional[str] = ""
    parameters: Optional[Dict] = {}

class Tool(BaseModel):
    type: str = "function"
    function: ToolFunction

class ChatCompletionRequest(BaseModel):
    model: str = "phi-4-mini"
    messages: List[ChatMessage]
    tools: Optional[List[Tool]] = None
    tool_choice: Optional[Union[str, Dict]] = "auto"
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 2048
    stream: Optional[bool] = False

class ModelInfo(BaseModel):
    id: str
    object: str = "model"
    created: int
    owned_by: str = "microsoft"
@app.get("/health")
async def health_check():
return {"status": "healthy", "model_loaded": llm is not None}
@app.get("/v1/models")
async def list_models():
return {
"object": "list",
"data": [
{
"id": "phi-4-mini",
"object": "model",
"created": int(time.time()),
"owned_by": "microsoft"
}
]
}
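
# Quick smoke tests once the server is up (host/port from the shell defaults):
#   curl http://localhost:7860/health
#   curl http://localhost:7860/v1/models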
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
if llm is None:
raise HTTPException(status_code=503, detail="Model not loaded")
# Convert messages to llama.cpp format
messages = []
for msg in request.messages:
message = {"role": msg.role, "content": msg.content or ""}
if msg.tool_calls:
message["tool_calls"] = msg.tool_calls
if msg.tool_call_id:
message["tool_call_id"] = msg.tool_call_id
messages.append(message)
# Prepare tools if provided
tools = None
if request.tools:
tools = [t.model_dump() for t in request.tools]
try:
response = llm.create_chat_completion(
messages=messages,
tools=tools,
tool_choice=request.tool_choice if tools else None,
temperature=request.temperature,
max_tokens=request.max_tokens,
stream=request.stream
)
if request.stream:
async def generate():
for chunk in response:
yield f"data: {json.dumps(chunk)}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(generate(), media_type="text/event-stream")
return JSONResponse(content=response)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
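
# The endpoint above is OpenAI-compatible, so the official openai client can
# talk to it (illustrative sketch; assumes the openai package is installed and
# uses a placeholder api_key, which this server never checks):
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")
#   reply = client.chat.completions.create(
#       model="phi-4-mini",
#       messages=[{"role": "user", "content": "Hello!"}],
#   )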
@app.post("/v1/completions")
async def completions(request: dict):
if llm is None:
raise HTTPException(status_code=503, detail="Model not loaded")
prompt = request.get("prompt", "")
max_tokens = request.get("max_tokens", 2048)
temperature = request.get("temperature", 0.7)
stream = request.get("stream", False)
try:
response = llm(
prompt=prompt,
max_tokens=max_tokens,
temperature=temperature,
stream=stream
)
if stream:
async def generate():
for chunk in response:
yield f"data: {json.dumps(chunk)}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(generate(), media_type="text/event-stream")
return JSONResponse(content=response)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
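
# Legacy text-completion endpoint above; an illustrative request:
#   curl -X POST http://localhost:7860/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Once upon a time", "max_tokens": 64}'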
if __name__ == "__main__":
    uvicorn.run(
        app,
        host=os.environ.get("HOST", "0.0.0.0"),
        port=int(os.environ.get("PORT", 7860))
    )
PYTHON_SERVER