# NOTE(review): the lines below were web-page residue ("Spaces: / Paused / Paused")
# from a Hugging Face Spaces listing, not code — preserved here as a comment so the
# module parses.
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
# Load model directly (no pipeline wrapper).
from transformers import AutoTokenizer, AutoModelForCausalLM

app = FastAPI()

# Loaded once at import time so every request reuses the same weights.
# NOTE(review): this downloads/loads a 7B model at startup — presumably
# intentional for a single-worker deployment; confirm memory budget.
tokenizer = AutoTokenizer.from_pretrained("Intel/neural-chat-7b-v3")
model = AutoModelForCausalLM.from_pretrained("Intel/neural-chat-7b-v3")
class ChatInput(BaseModel):
    """Request body for chat generation.

    Attributes:
        system_input: system prompt injected into the "### System:" slot.
        user_input: end-user message injected into the "### User:" slot.
    """

    system_input: str
    user_input: str
# NOTE(review): no route decorator is visible — a line like
# `@app.post("/generate")` was very likely lost when this file was scraped;
# confirm and restore it, otherwise the endpoint is never registered.
async def generate_response(chat_input: ChatInput) -> dict:
    """Generate an assistant reply for the given system/user prompts.

    Formats the inputs with the neural-chat prompt template, runs greedy
    generation, and returns only the text after the final "### Assistant:"
    marker.

    Args:
        chat_input: validated request body with system and user prompts.

    Returns:
        {"response": <assistant text>}.

    Raises:
        HTTPException: 500 wrapping any failure from tokenization/generation
            (boundary handler — the broad except is deliberate here).
    """
    try:
        # Prompt template expected by Intel/neural-chat-7b-v3.
        prompt = (
            f"### System:\n{chat_input.system_input}\n"
            f"### User:\n{chat_input.user_input}\n"
            f"### Assistant:\n"
        )
        # add_special_tokens=False: the template supplies all structure itself.
        inputs = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False)
        # max_length caps prompt+completion combined at 1000 tokens; a very
        # long prompt therefore shrinks the completion budget — consider
        # max_new_tokens if that becomes a problem.
        outputs = model.generate(inputs, max_length=1000, num_return_sequences=1)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # The decoded text echoes the prompt; keep only the assistant's part.
        return {"response": response.split("### Assistant:\n")[-1]}
    except Exception as e:
        # Top-level boundary: surface any failure as a 500 with its message.
        raise HTTPException(status_code=500, detail=str(e))