"""Minimal chat-completion HTTP service backed by a local HF causal language model."""

from typing import List

import torch
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI()

MODEL_NAME = "deepseek-ai/DeepSeek-R1-0528"

# BUG FIX: AutoModel loads the bare transformer without a language-model head,
# so .generate() would fail (or emit garbage); text generation requires the
# causal-LM variant.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).eval()
if torch.cuda.is_available():
    model = model.cuda()


class Message(BaseModel):
    """One turn of the conversation."""

    role: str  # "user" or "assistant"; other roles are silently ignored
    content: str


class ChatRequest(BaseModel):
    """Ordered conversation history, oldest turn first."""

    messages: List[Message]


def _build_prompt(messages: List[Message]) -> str:
    """Render the history as a plain-text prompt ending with an open assistant turn.

    Turns with roles other than "user"/"assistant" are dropped, matching the
    original behavior. Parts are joined once instead of concatenated with +=.
    """
    parts: List[str] = []
    for msg in messages:
        if msg.role == "user":
            parts.append(f"User: {msg.content}\n")
        elif msg.role == "assistant":
            parts.append(f"Assistant: {msg.content}\n")
    parts.append("Assistant:")
    return "".join(parts)


@app.post("/chat")
def chat_endpoint(request: ChatRequest):
    """Generate the next assistant reply for the given conversation.

    Declared as a plain ``def`` (not ``async``) on purpose: ``model.generate``
    is blocking, and inside an ``async`` handler it would stall the event loop
    for every concurrent request. FastAPI runs sync handlers in a threadpool.

    Returns: ``{"reply": <generated text>}``.
    """
    input_text = _build_prompt(request.messages)

    inputs = tokenizer(input_text, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}

    with torch.no_grad():
        # BUG FIX: max_length bounds prompt + completion combined, so a long
        # history left no room for a reply; max_new_tokens bounds only the
        # newly generated part.
        outputs = model.generate(**inputs, max_new_tokens=200)

    # BUG FIX: decode only the newly generated token slice. The original
    # split the full decoded text on "Assistant:", which breaks whenever a
    # user message or the reply itself contains that literal string.
    prompt_len = inputs["input_ids"].shape[1]
    reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
    return {"reply": reply}