from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

app = FastAPI()

tokenizer = AutoTokenizer.from_pretrained("Intel/neural-chat-7b-v3-1")
model = AutoModelForCausalLM.from_pretrained("Intel/neural-chat-7b-v3-1")


class ChatInput(BaseModel):
    system_input: str
    user_input: str


@app.post("/generate-response")
async def generate_response(chat_input: ChatInput):
    try:
        # Format the input using the provided template
        prompt = (
            f"### System:\n{chat_input.system_input}\n"
            f"### User:\n{chat_input.user_input}\n"
            f"### Assistant:\n"
        )

        # Tokenize and encode the prompt
        inputs = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False)

        # Generate a response
        outputs = model.generate(inputs, max_length=1000, num_return_sequences=1)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the assistant's response
        return {"response": response.split("### Assistant:\n")[-1]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
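

# A minimal sketch for serving and calling the endpoint above. The module
# name ("main"), host, and port are assumptions, not part of the original
# snippet; any ASGI server works, uvicorn is just one common choice.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request once the server is running (the client library is also an
# assumption; any HTTP client can POST the same JSON body):
#
#   import requests
#
#   payload = {
#       "system_input": "You are a helpful assistant.",
#       "user_input": "Explain FastAPI in one sentence.",
#   }
#   r = requests.post("http://127.0.0.1:8000/generate-response", json=payload)
#   print(r.json()["response"])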