from fastapi import FastAPI, Request from transformers import AutoTokenizer, AutoModelForCasualLM app = FastAPI() tokenizer = AutoTokenizer.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B") model = AutoModelForCausalLM.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B", torch_dtype=torch.bfloat16) @app.get("/gpt") async def gpt(prompt: str, req: Request): inputs = tokenizer(": Hello!\n:", return_tensors='pt').to(model.device) outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8) output_str = tokenizer.decode(outputs[0]) print(output_str) return {"response": output_str}