Spaces:
Sleeping
Sleeping
File size: 4,245 Bytes
5bc0d7f cb329ff 5bc0d7f cb329ff 5bc0d7f c346a26 cb329ff 5bc0d7f c346a26 5bc0d7f 4c541ae cb329ff 4c541ae 5bc0d7f cb329ff 5bc0d7f cb329ff 5bc0d7f 1e56efd 5bc0d7f 1e56efd 5bc0d7f 1e56efd 5bc0d7f 67997f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
#!/usr/bin/env python
import os
from threading import Thread
from queue import Queue, Empty
from typing import Iterator
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Markdown/HTML blurb shown at the top of the chat UI.
DESCRIPTION = (
    "# Sakalti/anchobi-4b"
    "\n<p>現在の環境に合わせて最適化されています。</p>"
)

# Upper bound and initial value of the "Max new tokens" slider.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024

# Prompts longer than this many tokens are trimmed from the left before
# generation; overridable through the environment.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "32768"))

model_id = "Sakalti/anchobi-4b"

# With CUDA, let the library pick dtype and device placement; otherwise load
# the weights as float32.
if torch.cuda.is_available():
    _load_kwargs = {"torch_dtype": "auto", "device_map": "auto"}
else:
    _load_kwargs = {"torch_dtype": torch.float32}
model = AutoModelForCausalLM.from_pretrained(model_id, **_load_kwargs)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_id)
def apply_chat_template(conversation: list[dict[str, str]]) -> str:
    """Render a conversation as a plain-text prompt ending with an open assistant turn.

    Each entry must provide ``"role"`` and ``"content"`` keys. Every turn is
    rendered as ``"<role>: <content>"`` on its own line (the role label is used
    verbatim), and a final ``"ASSISTANT: "`` line is appended for the model to
    complete.

    Args:
        conversation: Ordered chat turns, oldest first.

    Returns:
        The prompt string.
    """
    rendered_turns = (f"{turn['role']}: {turn['content']}" for turn in conversation)
    history = "\n".join(rendered_turns)
    return f"{history}\nASSISTANT: "
@torch.inference_mode()
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 0.95,
    top_k: int = 50,
    repetition_penalty: float = 1.0,
) -> Iterator[str]:
    """Stream the model's reply to ``message`` given the earlier chat turns.

    The history and the new message are rendered into a prompt with
    ``apply_chat_template``, tokenized, trimmed from the left to
    ``MAX_INPUT_TOKEN_LENGTH`` tokens if necessary, and passed to
    ``model.generate`` on a worker thread. Text is yielded as it is produced.

    Args:
        message: The new user message.
        chat_history: Previous ``(user, assistant)`` turns, oldest first.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens kept for sampling.
        repetition_penalty: Penalty applied to repeated tokens.

    Yields:
        The reply accumulated so far (each yield is the full partial reply,
        not just the newest chunk). While nothing has been generated yet and
        the model is slow, a "please wait" placeholder is yielded instead.

    Raises:
        gr.Error: If generation fails on the worker thread.
    """
    # Function-scope import keeps this block self-contained; the file already
    # depends on transformers.
    from transformers import TextIteratorStreamer

    conversation = []
    for user, assistant in chat_history:
        conversation.extend(
            [
                {"role": "USER", "content": user},
                {"role": "ASSISTANT", "content": assistant},
            ]
        )
    conversation.append({"role": "USER", "content": message})

    prompt = apply_chat_template(conversation)
    input_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep the most recent tokens so the latest turns survive the trim.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    # skip_prompt=True: only newly generated text is emitted, so the prompt is
    # not echoed back to the user. The streamer yields decoded text chunks
    # (whitespace intact) rather than whitespace-split words.
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=20.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    failures: list[Exception] = []

    def inference() -> None:
        try:
            model.generate(
                input_ids=input_ids,
                streamer=streamer,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                pad_token_id=tokenizer.eos_token_id,
            )
        except Exception as exc:  # reported to the consumer below
            failures.append(exc)
            # Without this the consumer would wait for an end-of-stream
            # signal that never arrives.
            streamer.end()

    Thread(target=inference).start()

    outputs: list[str] = []
    while True:
        try:
            text = next(streamer)
        except StopIteration:
            break
        except Empty:
            # No new text within the timeout. Show the placeholder only while
            # the reply is still empty so partial output is not overwritten.
            if not outputs:
                yield "現在応答を生成中です。しばらくお待ちください。"
            continue
        outputs.append(text)
        yield "".join(outputs)

    if failures:
        raise gr.Error(f"Generation failed: {failures[0]}") from failures[0]
# Chat UI wired to `generate`. The sliders are created in the same order as
# generate()'s extra parameters: max_new_tokens, temperature, top_p, top_k,
# repetition_penalty.
demo = gr.ChatInterface(
    fn=generate,
    type="tuples",
    additional_inputs_accordion=gr.Accordion(label="詳細設定", open=False),
    additional_inputs=[
        gr.Slider(
            label=slider_label,
            minimum=lower,
            maximum=upper,
            step=increment,
            value=initial,
        )
        # (label, minimum, maximum, step, default value)
        for slider_label, lower, upper, increment, initial in (
            ("Max new tokens", 1, MAX_MAX_NEW_TOKENS, 1, DEFAULT_MAX_NEW_TOKENS),
            ("Temperature", 0.1, 4.0, 0.1, 0.7),
            ("Top-p (nucleus sampling)", 0.05, 1.0, 0.05, 0.95),
            ("Top-k", 1, 1000, 1, 50),
            ("Repetition penalty", 1.0, 2.0, 0.05, 1.0),
        )
    ],
    stop_btn=None,
    examples=[
        ["東京の観光名所を教えて。"],
        ["落武者って何?"],
        ["暴れん坊将軍って誰のこと?"],
        ["人がヘリを食べるのにかかる時間は?"],
    ],
    description=DESCRIPTION,
    css_paths="style.css",
    fill_height=True,
)
if __name__ == "__main__":
    # Start the Gradio server only when executed as a script, not on import.
    demo.launch()