from threading import Thread
import asyncio
import inspect
import json
import os
import re
from typing import List, Type

import gradio as gr
import requests
from gradio import routes
from petals import AutoDistributedModelForCausalLM
from transformers import AutoTokenizer

import npc_data

loop = asyncio.get_event_loop()


# init code
def get_types(cls_set: List[Type], component: str):
    """Extract a description and a type name from each class docstring.

    For ``component == "input"`` the second docstring line is parsed; for
    any other value the last docstring line is parsed.

    Args:
        cls_set: Classes whose docstrings are inspected.
        component: ``"input"`` or anything else (treated as output).

    Returns:
        A ``(docset, types)`` tuple of two parallel lists: the text after
        the last ``:`` on the chosen line, and the text found between the
        first ``(`` ... ``)`` pair on that line.
    """
    line_index = 1 if component == "input" else -1
    docset = []
    types = []
    for cls in cls_set:
        described = inspect.getdoc(cls).split("\n")[line_index]
        docset.append(described.split(":")[-1])
        types.append(described.split(")")[0].split("(")[-1])
    return docset, types


# Replace gradio's own implementation with the one above.
# NOTE(review): presumably a workaround for API-doc generation in the
# installed gradio version -- confirm before removing.
routes.get_types = get_types


# App code
model_name = "daekeun-ml/Llama-2-ko-instruct-13B"
# Other candidate: quantumaikr/llama-2-70b-fb16-korean
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Distributed model handle; populated on demand by init() and reused after.
model = None


def init() -> bool:
    """Make sure the distributed model is loaded, if the swarm is healthy.

    The model is created at most once and cached in the module-level
    ``model`` variable so that later requests reuse it.

    Returns:
        True when the health endpoint reports ``model_name`` as healthy and
        ``model`` is ready for inference, False otherwise.
    """
    global model
    if not check(model_name):
        return False
    if model is None:
        model = AutoDistributedModelForCausalLM.from_pretrained(model_name)
    return True


def check(model_name: str) -> bool:
    """Return True if the Petals health API lists *model_name* as healthy."""
    data = requests.get("https://health.petals.dev/api/v1/state").json()
    return any(
        report["name"] == model_name and report["state"] == "healthy"
        for report in data["model_reports"]
    )


def chat(id, npc, prompt):
    """Answer *prompt* with the model and record the exchange.

    Args:
        id: User identifier sent to the coin and transaction endpoints.
        npc: NPC name; accepted for interface compatibility, currently unused.
        prompt: Text passed to the model.

    Returns:
        ``"no coin"`` when the coin endpoint reports a balance of 0,
        ``"no model"`` when the model is not available, otherwise the
        decoded model output.
    """
    # get_coin endpoint
    response = requests.post(
        "https://ldhldh-api-for-unity.hf.space/run/predict_6",
        json={"data": [id]},
    ).json()
    coin = response["data"][0]
    if int(coin) == 0:
        return "no coin"

    # model inference
    if init():
        # TODO(review): the original fed an empty string here; the raw user
        # prompt is used until an NPC-specific template is defined.
        inputs = tokenizer(prompt, return_tensors="pt")["input_ids"]
        outputs = model.generate(inputs, max_new_tokens=100)
        output = tokenizer.decode(outputs[0])
        print(output)
    else:
        output = "no model"

    # add_transaction endpoint; the reply payload is not used.
    requests.post(
        "https://ldhldh-api-for-unity.hf.space/run/predict_5",
        json={
            "data": [
                id,
                "inference",
                "### input:\n" + prompt + "\n\n### output:\n" + output,
            ]
        },
    )

    return output


with gr.Blocks() as demo:
    count = 0
    aa = gr.Interface(
        fn=chat,
        inputs=["text", "text", "text"],
        outputs="text",
        description="chat, ai 응답을 반환합니다. 내부적으로 트랜잭션 생성. \n /run/predict",
    )

demo.queue(max_size=32).launch(enable_queue=True)