import random

import torch
from quart import Quart, request
from transformers import AutoTokenizer, AutoModelForCausalLM

app = Quart(__name__)

# Load the tokenizer and model once at startup.
tokenizer = AutoTokenizer.from_pretrained("OpenBuddy/openbuddy-openllama-3b-v10-bf16")
model = AutoModelForCausalLM.from_pretrained("OpenBuddy/openbuddy-openllama-3b-v10-bf16")
model.eval()

# The system prompt is read from a local file and prepended to every request.
with open('system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()


@app.post("/request")
async def echo():
    data = await request.get_json()

    # Cap the requested generation length at 500 tokens.
    if data.get("max_tokens") is not None and data.get("max_tokens") > 500:
        data['max_tokens'] = 500

    user_prompt = prompt + "\n\nUser: " + data['request'] + "\nAssistant: "
    input_ids = tokenizer.encode(user_prompt, return_tensors='pt')

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            # Randomly toggle sampling and pick a temperature between 0.7 and 2.0.
            do_sample=random.choice([True, False]),
            temperature=random.randint(7, 20) / 10.0,
            max_new_tokens=data.get("max_tokens") or random.randint(200, 500),
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, skipping the echoed prompt
    # (generate() has no return_full_text argument; that is a pipeline option).
    output = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return {"output": output}


@app.get("/")
async def get():
    return "better to run it on own container"
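
# A minimal sketch of how the endpoint might be exercised once the app is running.
# The host and port below are assumptions (e.g. `hypercorn main:app` binds to
# 127.0.0.1:8000 by default) and are not specified in this file:
#
#   curl -X POST http://127.0.0.1:8000/request \
#        -H "Content-Type: application/json" \
#        -d '{"request": "Hello, who are you?", "max_tokens": 300}'
#
# The response is a JSON object of the form {"output": "..."}.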