|
import os |
|
import subprocess |
|
import time |
|
import gradio as gr |
|
from transformers import AutoTokenizer |
|
from optimum.intel.openvino import OVModelForCausalLM |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face model id to download and export.
MODEL_NAME = "Qwen/Qwen3-0.6B"

# Local directory that receives the exported OpenVINO IR (int4 weights).
MODEL_DIR = "Qwen3-0.6B-int4-ov"

# optimum-cli command that exports the model to OpenVINO format:
#   --task text-generation-with-past : export with KV-cache support
#   --weight-format int4 --sym       : symmetric int4 weight compression
#   --group-size -1                  : per-channel (no grouping) quantization
#   --backup-precision int8_sym      : fallback precision for layers that
#                                      cannot be compressed to int4
# The positional output directory is placed last, after all flags, per
# standard CLI convention (the original interleaved it between options).
EXPORT_CMD = [
    "optimum-cli", "export", "openvino",
    "--model", MODEL_NAME,
    "--task", "text-generation-with-past",
    "--weight-format", "int4",
    "--sym",
    "--group-size", "-1",
    "--backup-precision", "int8_sym",
    MODEL_DIR,
]
|
|
|
# One-time setup: export the model only when the local IR directory is
# missing; a failed export aborts the whole app with a hint for the user.
model_already_exported = os.path.exists(MODEL_DIR)
if not model_already_exported:
    print("🔍 模型目錄不存在,開始自動匯出模型... this may take several minutes")
    try:
        subprocess.run(EXPORT_CMD, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ 模型匯出失敗: {e}")
        raise SystemExit("請檢查網路連線或磁碟空間")
    print("✅ 模型匯出完成")
|
|
|
|
|
# Tokenizer is pulled from the original Hugging Face repo; the compiled
# model loads from the locally exported OpenVINO IR and runs on CPU.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = OVModelForCausalLM.from_pretrained(MODEL_DIR, device="CPU", trust_remote_code=True)
|
|
|
|
|
# Token id of Qwen3's "</think>" marker, which separates the model's
# chain-of-thought ("thinking") from the final answer.  Hard-coded per
# the Qwen3 model card; re-verify if the tokenizer is ever swapped.
THINK_END_TOKEN_ID = 151668


def generate_response(prompt: str):
    """Run one chat turn and split the output into thinking vs. answer.

    Args:
        prompt: The user's message.

    Returns:
        A ``(speed_str, thinking, content)`` tuple:
        ``speed_str`` -- generation throughput as "X.XX tokens/sec"
        (or a notice when the elapsed time is too small to measure),
        ``thinking`` -- text emitted before the ``</think>`` marker,
        ``content`` -- the final response after the marker.
    """
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True,
    )

    inputs = tokenizer([text], return_tensors="pt")

    # perf_counter is monotonic, so the measured interval cannot be
    # skewed (or go negative) by wall-clock adjustments mid-generation,
    # unlike the original time.time().
    start_time = time.perf_counter()
    generated = model.generate(**inputs, max_new_tokens=512)
    inference_time = time.perf_counter() - start_time

    # Drop the prompt tokens; keep only what the model generated.
    output_ids = generated[0][len(inputs["input_ids"][0]):].tolist()
    num_tokens = len(output_ids)

    # Find the LAST occurrence of </think>; idx points just past it.
    # If the marker is absent, treat the whole output as final content.
    try:
        idx = len(output_ids) - output_ids[::-1].index(THINK_END_TOKEN_ID)
    except ValueError:
        idx = 0

    thinking = tokenizer.decode(output_ids[:idx], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[idx:], skip_special_tokens=True).strip("\n")

    if inference_time > 0:
        tokens_per_sec = num_tokens / inference_time
        speed_str = f"{tokens_per_sec:.2f} tokens/sec"
    else:
        speed_str = "推理時間過短,無法計算速度"

    return speed_str, thinking, content
|
|
|
|
|
# Gradio UI: a single prompt textbox in, three textboxes out
# (generation throughput, thinking trace, final answer) — wired
# to generate_response's 3-tuple return in the same order.
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=5, label="輸入提示 (Prompt)"),
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="思考過程"),
        gr.Textbox(label="最終回應")
    ],
    title="Qwen3-0.6B OpenVINO + Gradio",
    description="基於 OpenVINO 最佳化的 Qwen3-0.6B 推理應用,支援思考過程分離與 GUI。"
)
|
|
|
# Launch the Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()