from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
app = FastAPI()
tokenizer = AutoTokenizer.from_pretrained("Intel/neural-chat-7b-v3-1")
model = AutoModelForCausalLM.from_pretrained("Intel/neural-chat-7b-v3-1")
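
# Note: from_pretrained downloads and caches the weights on first startup;
# neural-chat-7b-v3-1 is a 7B-parameter model, so loading takes a while and
# runs on CPU unless the model is explicitly moved to a GPU.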

class ChatInput(BaseModel):
    system_input: str
    user_input: str

@app.post("/generate-response")
async def generate_response(chat_input: ChatInput):
    try:
        # Format the input using the model's prompt template
        prompt = (
            f"### System:\n{chat_input.system_input}\n"
            f"### User:\n{chat_input.user_input}\n"
            f"### Assistant:\n"
        )
        # Tokenize and encode the prompt
        inputs = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False)
        # Generate a response; pad_token_id expects a token id (int),
        # so use eos_token_id rather than the eos_token string
        outputs = model.generate(
            inputs,
            max_length=1000,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Return only the text after the final "### Assistant:" marker
        return {"response": response.split("### Assistant:\n")[-1]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
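
# A minimal client sketch, assuming the app is served locally with
# `uvicorn app:app --port 8000` (the module name "app" and the port are
# assumptions; the Space's actual entry point is not shown here):
#
#   import requests
#
#   payload = {
#       "system_input": "You are a helpful assistant.",
#       "user_input": "Summarize what FastAPI is in one sentence.",
#   }
#   r = requests.post("http://127.0.0.1:8000/generate-response", json=payload)
#   r.raise_for_status()
#   print(r.json()["response"])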