from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

app = FastAPI()

# Load the model and tokenizer
#model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # Mistral 7B model
#model_name = "HuggingFaceH4/zephyr-3b"
#model_name = "serkanarslan/mistral-7b-mini-ft"
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"  # Uses the GPU if available
)

# Define the request format
class ChatRequest(BaseModel):
    message: str

@app.post("/chat")
async def chat(request: ChatRequest):
    # Move the inputs to the same device as the model instead of hardcoding
    # "cuda", so the endpoint also works on CPU-only machines
    inputs = tokenizer(request.message, return_tensors="pt").to(model.device)
    # max_new_tokens counts only the generated tokens, unlike max_length,
    # which also counts the prompt; pad_token_id avoids the warning for
    # models (like phi-2) whose tokenizer defines no pad token
    output = model.generate(
        **inputs,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return {"response": response}
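
# --- Usage sketch (illustrative, not from the original snippet): run the app
# directly with uvicorn. Assumes uvicorn is installed (pip install uvicorn);
# the host and port below are arbitrary defaults, not values from the source.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Once running, the endpoint can be exercised with, for example:
#   curl -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello, who are you?"}'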