import os
import subprocess
import time
import gradio as gr
from transformers import AutoTokenizer
from optimum.intel.openvino import OVModelForCausalLM
# ========== Installation and export (run once on first use) ==========
# If the dependencies are not installed yet, run the following commands manually:
# pip install --pre -U openvino-genai openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
# pip install nncf
# pip install git+https://github.com/openvino-dev-samples/optimum-intel.git@2aebd4441023d3c003b27c87fff5312254ac
# pip install "transformers>=4.51.3"
MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_DIR = "Qwen3-0.6B-int4-ov"
# Optimum-CLI export command: int4 symmetric weight quantization with int8_sym backup precision
EXPORT_CMD = [
    "optimum-cli", "export", "openvino",
    "--model", MODEL_NAME,
    "--task", "text-generation-with-past",
    "--weight-format", "int4",
    "--sym",
    "--group-size", "-1",
    "--backup-precision", "int8_sym",
    MODEL_DIR,
]
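# For reference, a one-line shell sketch of what EXPORT_CMD runs via subprocess:
#   optimum-cli export openvino --model Qwen/Qwen3-0.6B \
#       --task text-generation-with-past --weight-format int4 --sym \
#       --group-size -1 --backup-precision int8_sym Qwen3-0.6B-int4-ov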
if not os.path.exists(MODEL_DIR):
    print("🔍 Model directory not found; exporting the model automatically... this may take several minutes")
    try:
        subprocess.run(EXPORT_CMD, check=True)
        print("✅ Model export finished")
    except subprocess.CalledProcessError as e:
        print(f"❌ Model export failed: {e}")
        raise SystemExit("Please check your network connection and available disk space")
# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = OVModelForCausalLM.from_pretrained(MODEL_DIR, device="CPU", trust_remote_code=True)
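# Note: device="CPU" selects the OpenVINO CPU plugin; on machines with a
# supported Intel GPU, device="GPU" should also work (assuming the OpenVINO
# GPU plugin is installed) -- untested here, left as a sketch.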
# Generate a response: returns generation speed, the thinking trace, and the final answer
def generate_response(prompt: str):
    messages = [{"role": "user", "content": prompt}]
    # enable_thinking=True makes Qwen3 emit a <think>...</think> reasoning trace
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True,
    )
    inputs = tokenizer([text], return_tensors="pt")
    # Time the inference
    start_time = time.time()
    generated = model.generate(**inputs, max_new_tokens=512)
    inference_time = time.time() - start_time
    # Count only the newly generated tokens (exclude the prompt tokens)
    output_ids = generated[0][len(inputs["input_ids"][0]):].tolist()
    num_tokens = len(output_ids)
    # Try to split the thinking trace from the answer;
    # 151668 is the token id of Qwen3's </think> token
    try:
        idx = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        # No </think> token found: treat the whole output as the answer
        idx = 0
    thinking = tokenizer.decode(output_ids[:idx], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[idx:], skip_special_tokens=True).strip("\n")
    # Compute tokens/sec
    if inference_time > 0:
        tokens_per_sec = num_tokens / inference_time
        speed_str = f"{tokens_per_sec:.2f} tokens/sec"
    else:
        speed_str = "Inference finished too quickly to measure speed"
    return speed_str, thinking, content
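# Quick smoke test without the UI (uncomment to try; the prompt is arbitrary):
# speed, thinking, answer = generate_response("Briefly explain what OpenVINO is.")
# print(speed)
# print(answer)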
# Build the Gradio interface
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=5, label="Prompt"),
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="Thinking process"),
        gr.Textbox(label="Final response"),
    ],
    title="Qwen3-0.6B OpenVINO + Gradio",
    description="An OpenVINO-optimized Qwen3-0.6B inference app that separates the thinking trace from the final answer, with a GUI.",
)
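# When running locally, demo.launch(share=True) can create a temporary public
# link, and demo.queue() serializes requests if several users share this
# CPU-bound model at once.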
if __name__ == "__main__":
    demo.launch()