# Hugging Face Space file header (web-page residue, not code):
# author VikranthBhat — "Update app.py" — commit 9fe70b7 (verified)
import os
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from huggingface_hub import InferenceClient
import json
import asyncio
app = FastAPI()
# Get your token from Hugging Face Secrets (Settings > Secrets)
HF_TOKEN = os.getenv("HF_TOKEN")
# Model choice (e.g., "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct")
MODEL_ID = "Qwen/Qwen2.5-Coder-32B-Instruct" #"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)
@app.get("/")
def health_check():
return {"status": "Agent Active", "model": MODEL_ID}
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """OpenAI-compatible chat completions endpoint.

    Expects a JSON body with ``messages`` (list of chat messages) and
    optionally ``stream`` (bool), ``max_tokens`` (int, default 1024) and
    ``temperature`` (float, default 0.7).

    Returns either a ``text/event-stream`` SSE response (``stream=True``)
    or the provider's full chat-completion response object.
    """
    body = await request.json()
    messages = body.get("messages", [])
    stream = body.get("stream", False)

    if stream:
        return StreamingResponse(
            stream_generator(messages),
            media_type="text/event-stream",
        )

    # Fix: client.chat_completion is a blocking (synchronous) call; invoking
    # it directly inside an async endpoint stalls the event loop for the
    # entire generation. Run it in a worker thread instead.
    response = await asyncio.to_thread(
        client.chat_completion,
        messages=messages,
        max_tokens=body.get("max_tokens", 1024),
        temperature=body.get("temperature", 0.7),
    )
    return response
async def stream_generator(messages, max_tokens=2048, temperature=0.7):
    """Yield an OpenAI-compatible SSE stream from the HF inference client.

    Args:
        messages: chat messages forwarded verbatim to the model.
        max_tokens: generation cap (was hard-coded; now a defaulted
            parameter so callers can override it — backward compatible).
        temperature: sampling temperature, defaulted to match the
            non-streaming branch of /v1/chat/completions.

    Yields ``data: {...}\n\n`` lines in OpenAI's chunk wire format,
    terminated by the standard ``data: [DONE]`` sentinel.
    """
    for chunk in client.chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stream=True,
    ):
        choice = chunk.choices[0]
        data = {
            "id": "chatcmpl-custom",
            "object": "chat.completion.chunk",
            "choices": [{
                # delta.content can be None on role-only/terminal chunks;
                # emit "" instead of null so concatenating clients don't break.
                "delta": {"content": choice.delta.content or ""},
                "finish_reason": choice.finish_reason,
                "index": 0,
            }],
        }
        yield f"data: {json.dumps(data)}\n\n"
        # The HF stream iterator is synchronous: explicitly yield control
        # so the event loop can serve other requests between chunks.
        await asyncio.sleep(0)
    yield "data: [DONE]\n\n"
# Local/dev entry point; in a HF Space the platform typically launches the app.
if __name__ == "__main__":
    import uvicorn
    # Port 7860 is the port Hugging Face Spaces expects the app to listen on.
    uvicorn.run(app, host="0.0.0.0", port=7860)