File size: 3,257 Bytes
6f98787 2915514 6f98787 7cc92ff 486d488 deffb99 7cc92ff 6f98787 83bb258 2915514 7cc92ff 6f98787 83bb258 6f98787 83bb258 2915514 83bb258 6f98787 3bde936 6f98787 7cc92ff 6f98787 2915514 6f98787 7cc92ff ee1ca9e 7cc92ff 6f98787 7cc92ff ee1ca9e 7cc92ff 2915514 7cc92ff 68f4b64 7cc92ff c116177 7cc92ff ee1ca9e 7cc92ff 65f515d 83bb258 deffb99 7cc92ff 83bb258 7cc92ff 0df4668 7cc92ff deffb99 7cc92ff 6f98787 2915514 6f98787 deffb99 6f98787 7cc92ff 6f98787 3486524 4d871c7 7cc92ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import os
import subprocess
import time
import gradio as gr
from transformers import AutoTokenizer
from optimum.intel.openvino import OVModelForCausalLM
# ========== Setup & export (run once on first use) ==========
# If the dependencies are not installed yet, run these manually:
#   pip install --pre -U openvino-genai openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
#   pip install nncf
#   pip install git+https://github.com/openvino-dev-samples/optimum-intel.git@2aebd4441023d3c003b27c87fff5312254ac
#   pip install "transformers>=4.51.3"
MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_DIR = "Qwen3-0.6B-int4-ov"

# optimum-cli export command: symmetric int4 weight quantization with an
# int8_sym backup precision for layers that cannot be quantized to int4.
EXPORT_CMD = [
    "optimum-cli", "export", "openvino",
    "--model", MODEL_NAME,
    "--task", "text-generation-with-past",
    "--weight-format", "int4",
    "--sym",
    "--group-size", "-1",
    MODEL_DIR,
    "--backup-precision", "int8_sym",
]
# First-run bootstrap: export the quantized OpenVINO model if the local
# directory does not exist yet.
if not os.path.exists(MODEL_DIR):
    print("🔍 模型目錄不存在,開始自動匯出模型... this may take several minutes")
    try:
        # argv list (shell=False) — no shell string interpolation.
        subprocess.run(EXPORT_CMD, check=True)
        print("✅ 模型匯出完成")
    except FileNotFoundError as e:
        # optimum-cli binary itself is missing from PATH — previously this
        # escaped uncaught and crashed with a raw traceback.
        print(f"❌ 找不到 optimum-cli: {e}")
        raise SystemExit("請先安裝 optimum-intel(pip install optimum[openvino])")
    except subprocess.CalledProcessError as e:
        print(f"❌ 模型匯出失敗: {e}")
        raise SystemExit("請檢查網路連線或磁碟空間")
# Load the tokenizer from the Hub model and the quantized OpenVINO model
# from the locally exported directory; inference runs on CPU.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = OVModelForCausalLM.from_pretrained(MODEL_DIR, device="CPU", trust_remote_code=True)
# 生成回應函數
def generate_response(prompt: str, max_new_tokens: int = 512):
    """Run one chat turn and split the output into thinking / answer parts.

    Args:
        prompt: The user's message text.
        max_new_tokens: Cap on generated tokens (default 512, unchanged
            from the original hard-coded value).

    Returns:
        Tuple of (speed_str, thinking, content): a "tokens/sec" summary
        string, the model's thinking segment, and the final answer text.
    """
    # Token id that terminates the thinking segment
    # (</think> for Qwen3 — NOTE(review): confirm against the tokenizer vocab).
    END_THINK_TOKEN_ID = 151668

    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True,
    )
    inputs = tokenizer([text], return_tensors="pt")

    # Time only the generate() call.
    start_time = time.time()
    generated = model.generate(**inputs, max_new_tokens=max_new_tokens)
    inference_time = time.time() - start_time

    # Newly generated tokens = everything after the prompt tokens.
    output_ids = generated[0][len(inputs["input_ids"][0]):].tolist()
    num_tokens = len(output_ids)

    # Split at the LAST end-of-thinking marker; if the marker is absent,
    # treat the whole output as the answer (idx = 0 → empty thinking).
    try:
        idx = len(output_ids) - output_ids[::-1].index(END_THINK_TOKEN_ID)
    except ValueError:
        idx = 0
    thinking = tokenizer.decode(output_ids[:idx], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[idx:], skip_special_tokens=True).strip("\n")

    # tokens/sec, guarding against a zero-duration timer reading.
    if inference_time > 0:
        tokens_per_sec = num_tokens / inference_time
        speed_str = f"{tokens_per_sec:.2f} tokens/sec"
    else:
        speed_str = "推理時間過短,無法計算速度"
    return speed_str, thinking, content
# Build the Gradio UI: one prompt textbox in, three textboxes out
# (generation speed, thinking trace, final answer).
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=5, label="輸入提示 (Prompt)"),
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="思考過程"),
        gr.Textbox(label="最終回應")
    ],
    title="Qwen3-0.6B OpenVINO + Gradio",
    description="基於 OpenVINO 最佳化的 Qwen3-0.6B 推理應用,支援思考過程分離與 GUI。"
)
# Launch the web server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()