# Hugging Face Space: OpenAI-compatible chat proxy (Spaces page chrome removed).
import os
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from huggingface_hub import InferenceClient
import json
import asyncio  # NOTE(review): not used in the visible code — confirm before removing

app = FastAPI()

# Get your token from Hugging Face Secrets (Settings > Secrets)
HF_TOKEN = os.getenv("HF_TOKEN")  # may be None -> anonymous, rate-limited access

# Model choice (e.g., "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct")
MODEL_ID = "Qwen/Qwen2.5-Coder-32B-Instruct" #"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

# Shared inference client used by both the streaming and non-streaming paths.
client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)
# NOTE(review): no route decorator is visible here — presumably this was
# registered as e.g. @app.get("/"); confirm against the original Space source.
def health_check():
    """Liveness probe: report that the agent is up and which model it proxies."""
    return {"status": "Agent Active", "model": MODEL_ID}
# NOTE(review): no route decorator is visible here — presumably this backed
# POST /v1/chat/completions; confirm against the original Space source.
async def chat_completions(request: Request):
    """OpenAI-compatible chat endpoint: forward the request body to the HF client.

    Honors the request's "stream" flag: returns an SSE stream when true,
    otherwise returns the provider's completion object directly.
    """
    payload = await request.json()
    history = payload.get("messages", [])

    if payload.get("stream", False):
        # Server-sent events: one JSON chunk per frame.
        return StreamingResponse(
            stream_generator(history),
            media_type="text/event-stream"
        )

    # Standard non-streaming response
    return client.chat_completion(
        messages=history,
        max_tokens=payload.get("max_tokens", 1024),
        temperature=payload.get("temperature", 0.7),
    )
async def stream_generator(messages):
    """Generates an OpenAI-compatible SSE stream.

    Yields "data: {json}\n\n" frames mirroring OpenAI's chat.completion.chunk
    wire format, terminated by a "data: [DONE]" sentinel.

    NOTE(review): client.chat_completion(stream=True) is a blocking iterator
    consumed inside an async generator, so the event loop is tied up while
    waiting on the network — consider a plain sync generator (StreamingResponse
    runs those in a threadpool) or asyncio.to_thread; confirm before changing.
    """
    for chunk in client.chat_completion(
        messages=messages,
        max_tokens=2048,
        stream=True,
    ):
        # Some providers emit keep-alive/usage chunks with an empty choices
        # list; skip them rather than raise IndexError mid-stream.
        if not chunk.choices:
            continue
        choice = chunk.choices[0]
        # Format the chunk to look like OpenAI's wire format. The final chunk
        # carries content=None; coerce to "" so we never emit "content": null,
        # which strict OpenAI clients (expecting a string delta) reject.
        data = {
            "id": "chatcmpl-custom",
            "object": "chat.completion.chunk",
            "choices": [{
                "delta": {"content": choice.delta.content or ""},
                "finish_reason": choice.finish_reason,
                "index": 0,
            }],
        }
        yield f"data: {json.dumps(data)}\n\n"
    yield "data: [DONE]\n\n"
if __name__ == "__main__":
    # Local/dev entry point; Hugging Face Spaces serves the app on port 7860.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)