"""Model hosted on Hugging face. Based on: https://huggingface.co/docs/hub/spaces-sdks-docker-first-demo """ from fastapi import FastAPI, Request from transformers import AutoTokenizer, AutoModelForSeq2SeqLM from transformers import T5Tokenizer, T5ForConditionalGeneration token_size_limit = None # FROM: https://huggingface.co/facebook/blenderbot-400M-distill?text=Hey+my+name+is+Thomas%21+How+are+you%3F # tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") # model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill") # tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-1B-distill") # model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-1B-distill") # token_size_limit = 128 # T5 model can use "any" sequence lenghth, but memory usage is O(L^2). # tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small") # model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small") # tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base") # model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base") tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large") model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large") token_size_limit = 512 # Too large for 16GB # tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl") # model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl") app = FastAPI() @app.post('/reply') async def Reply(req: Request): request = await req.json() msg = request['msg'] print(f'MSG: {msg}') input_ids = tokenizer(msg, return_tensors='pt').input_ids # .to('cuda') output = model.generate( input_ids[:, -token_size_limit:], do_sample=True, temperature=0.9, max_length=100, ) reply = tokenizer.batch_decode(output)[0] print(f'REPLY: {reply}') return {'reply': reply} @app.get("/") def read_root(): return {"Hello": "World!"}