import torch
from fastapi import FastAPI, Request
from transformers import AutoModelForCausalLM, AutoTokenizer


# ASGI application object; route handlers register themselves on it.
app = FastAPI()

# Hugging Face Hub id of the checkpoint. Kept in one place so the tokenizer
# and the model are always loaded from the same repository.
MODEL_NAME = "togethercomputer/GPT-NeoXT-Chat-Base-20B"

# Both objects are created once at import time and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# torch_dtype asks transformers to load the weights as bfloat16.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)


@app.get("/gpt")
async def gpt(prompt: str, req: Request):
    """Generate a short sampled chat reply for ``prompt``.

    The caller's text is wrapped in the ``<human>: ...\\n<bot>:`` turn format
    and passed to the module-level ``model``.

    Args:
        prompt: The user's message, taken from the ``prompt`` query parameter.
        req: The incoming request; not used by the handler body.

    Returns:
        A dict with a single ``"response"`` key holding the decoded text of
        the first generated sequence.
    """
    # Use the caller's prompt instead of a fixed greeting so the endpoint
    # actually answers what was asked.
    chat_prompt = f"<human>: {prompt}\n<bot>:"
    inputs = tokenizer(chat_prompt, return_tensors="pt").to(model.device)

    # NOTE(review): generate() is a synchronous call inside an async handler,
    # so it presumably blocks the event loop while it runs -- confirm whether
    # that is acceptable for the expected load.
    outputs = model.generate(
        **inputs,
        max_new_tokens=10,
        do_sample=True,
        temperature=0.8,
    )

    output_str = tokenizer.decode(outputs[0])
    print(output_str)
    return {"response": output_str}