File size: 4,245 Bytes
5bc0d7f
 
 
 
cb329ff
5bc0d7f
 
 
 
 
cb329ff
5bc0d7f
c346a26
cb329ff
5bc0d7f
 
 
 
 
c346a26
5bc0d7f
 
4c541ae
cb329ff
4c541ae
 
 
5bc0d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb329ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5bc0d7f
 
cb329ff
 
 
 
 
 
 
 
 
5bc0d7f
 
1e56efd
5bc0d7f
1e56efd
 
5bc0d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e56efd
 
 
5bc0d7f
 
 
67997f6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python

import os
from threading import Thread
from queue import Queue, Empty
from typing import Iterator

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Markdown passed to the chat UI as its description: a title line followed by
# a short note.
DESCRIPTION = "\n".join(
    [
        "# Sakalti/anchobi-4b",
        "<p>現在の環境に合わせて最適化されています。</p>",
    ]
)

# Initial value and upper bound of the "Max new tokens" slider.
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_MAX_NEW_TOKENS = 2048
# Prompt length cap, in tokens; can be overridden through the environment
# variable of the same name.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "32768"))

model_id = "Sakalti/anchobi-4b"

# With CUDA available, let the library pick dtype and device placement;
# otherwise load the weights as float32.
_load_kwargs = (
    {"torch_dtype": "auto", "device_map": "auto"}
    if torch.cuda.is_available()
    else {"torch_dtype": torch.float32}
)
model = AutoModelForCausalLM.from_pretrained(model_id, **_load_kwargs)
model.eval()  # inference only

tokenizer = AutoTokenizer.from_pretrained(model_id)


def apply_chat_template(conversation: list[dict[str, str]]) -> str:
    """Render a conversation as a plain-text prompt.

    Every turn becomes one ``<role>: <content>`` line, in order. The result
    always ends with a newline followed by ``ASSISTANT: `` so the model
    continues from the assistant's turn (for an empty conversation the prompt
    is just that trailing part).

    Args:
        conversation: Turns as dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        The prompt string.
    """
    rendered_turns = "\n".join(
        "{}: {}".format(turn["role"], turn["content"]) for turn in conversation
    )
    return rendered_turns + "\nASSISTANT: "


@torch.inference_mode()
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 0.95,
    top_k: int = 50,
    repetition_penalty: float = 1.0,
) -> Iterator[str]:
    """Stream the assistant's reply to ``message`` while it is being generated.

    The chat history and the new message are rendered into a single prompt,
    tokenized, trimmed to the last ``MAX_INPUT_TOKEN_LENGTH`` tokens if
    necessary, and passed to ``model.generate`` on a worker thread. Decoded
    text is handed over through a ``TextIteratorStreamer`` so the caller
    receives the reply incrementally.

    Args:
        message: The new user message.
        chat_history: Previous ``(user, assistant)`` turns.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens kept for sampling.
        repetition_penalty: Penalty applied to repeated tokens.

    Yields:
        The reply accumulated so far (prompt excluded, whitespace preserved).
        If nothing has been produced within the streamer timeout, a
        "please wait" placeholder is yielded instead; once some text exists,
        the partial reply is re-yielded rather than being replaced.

    Raises:
        Exception: Whatever ``model.generate`` raised on the worker thread is
            re-raised here after the stream has been drained.
    """
    # Local import: only the package (already a dependency of this file) is
    # needed, and this keeps the function independent of the import block.
    from transformers import TextIteratorStreamer

    conversation = []
    for user, assistant in chat_history:
        conversation.extend([{"role": "USER", "content": user}, {"role": "ASSISTANT", "content": assistant}])
    conversation.append({"role": "USER", "content": message})

    prompt = apply_chat_template(conversation)
    input_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep the most recent tokens; the oldest part of the conversation is dropped.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    # skip_prompt=True keeps the echoed prompt out of the reply; the timeout
    # makes next(streamer) raise queue.Empty so a heartbeat can be sent.
    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
    worker_errors: list[Exception] = []

    def inference() -> None:
        # NOTE(review): torch.inference_mode is documented as thread-local, so
        # the decorator above does not cover this thread; model.generate is
        # presumed to disable gradient tracking itself — confirm.
        try:
            model.generate(
                input_ids=input_ids,
                streamer=streamer,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                pad_token_id=tokenizer.eos_token_id,
            )
        except Exception as exc:  # thread boundary: hand the error to the consumer
            worker_errors.append(exc)
            # Without an end-of-stream signal the consumer loop would wait forever.
            streamer.end()

    Thread(target=inference).start()

    pieces: list[str] = []
    while True:
        try:
            piece = next(streamer)
        except StopIteration:
            break
        except Empty:
            # Nothing arrived within the timeout: keep the UI alive without
            # discarding text that has already been shown.
            yield "".join(pieces) if pieces else "現在応答を生成中です。しばらくお待ちください。"
            continue
        pieces.append(piece)
        yield "".join(pieces)

    if worker_errors:
        raise worker_errors[0]


# Collapsed panel that holds the generation controls.
_advanced_panel = gr.Accordion(label="詳細設定", open=False)

# (label, minimum, maximum, step, initial value) for each generation control,
# listed in the same order as the extra parameters of `generate`.
_slider_specs = (
    ("Max new tokens", 1, MAX_MAX_NEW_TOKENS, 1, DEFAULT_MAX_NEW_TOKENS),
    ("Temperature", 0.1, 4.0, 0.1, 0.7),
    ("Top-p (nucleus sampling)", 0.05, 1.0, 0.05, 0.95),
    ("Top-k", 1, 1000, 1, 50),
    ("Repetition penalty", 1.0, 2.0, 0.05, 1.0),
)
_sliders = [
    gr.Slider(label=label, minimum=low, maximum=high, step=step, value=initial)
    for label, low, high, step, initial in _slider_specs
]

# Sample prompts offered in the UI; each example is a one-element list.
_example_prompts = (
    "東京の観光名所を教えて。",
    "落武者って何?",
    "暴れん坊将軍って誰のこと?",
    "人がヘリを食べるのにかかる時間は?",
)

demo = gr.ChatInterface(
    fn=generate,
    type="tuples",
    additional_inputs_accordion=_advanced_panel,
    additional_inputs=_sliders,
    stop_btn=None,
    examples=[[prompt] for prompt in _example_prompts],
    description=DESCRIPTION,
    css_paths="style.css",
    fill_height=True,
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()