import os
import subprocess
import time
import gradio as gr
from transformers import AutoTokenizer
from optimum.intel.openvino import OVModelForCausalLM

# ========== Installation and export (run on first use) ==========
# If the dependencies are not installed yet, run the following commands manually:
# pip install --pre -U openvino-genai openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
# pip install nncf
# pip install git+https://github.com/openvino-dev-samples/optimum-intel.git@2aebd4441023d3c003b27c87fff5312254ac
# pip install "transformers>=4.51.3"

MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_DIR = "Qwen3-0.6B-int4-ov"

# optimum-cli export command: int4 weight quantization with int8_sym backup precision
EXPORT_CMD = [
    "optimum-cli", "export", "openvino",
    "--model", MODEL_NAME,
    "--task", "text-generation-with-past",
    "--weight-format", "int4",
    "--sym",
    "--group-size", "-1",
    MODEL_DIR,
    "--backup-precision", "int8_sym"
]
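# For reference, the equivalent shell command (same flags as EXPORT_CMD above):
# optimum-cli export openvino --model Qwen/Qwen3-0.6B --task text-generation-with-past \
#     --weight-format int4 --sym --group-size -1 Qwen3-0.6B-int4-ov --backup-precision int8_sym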

if not os.path.exists(MODEL_DIR):
    print("🔍 Model directory not found; exporting the model automatically... this may take several minutes")
    try:
        subprocess.run(EXPORT_CMD, check=True)
        print("✅ Model export complete")
    except subprocess.CalledProcessError as e:
        print(f"❌ Model export failed: {e}")
        raise SystemExit("Please check your network connection and disk space")

# Load the tokenizer and the OpenVINO-optimized model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = OVModelForCausalLM.from_pretrained(MODEL_DIR, device="CPU", trust_remote_code=True)
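# Optional smoke test (a minimal sketch; uncomment to check that the model generates):
# sanity = tokenizer("Hello", return_tensors="pt")
# print(tokenizer.decode(model.generate(**sanity, max_new_tokens=8)[0], skip_special_tokens=True))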

# Generate a response, returning throughput, thinking text, and the final answer
def generate_response(prompt: str):
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True
    )

    inputs = tokenizer([text], return_tensors="pt")

    # Time the inference
    start_time = time.time()
    generated = model.generate(**inputs, max_new_tokens=512)
    inference_time = time.time() - start_time
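    # Note: Qwen3's model card suggests sampling (roughly temperature=0.6, top_p=0.95)
    # in thinking mode rather than greedy decoding; those kwargs could be passed to
    # model.generate above if desired (defaults are kept here for simplicity).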

    # Count the newly generated tokens
    output_ids = generated[0][len(inputs["input_ids"][0]):].tolist()
    num_tokens = len(output_ids)

    # Try to split the thinking part from the final answer:
    # 151668 is the id of Qwen3's </think> token; find its last occurrence
    try:
        idx = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        idx = 0  # no </think> token found; treat everything as the final answer

    thinking = tokenizer.decode(output_ids[:idx], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[idx:], skip_special_tokens=True).strip("\n")
    
    # Compute generation speed in tokens/sec
    if inference_time > 0:
        tokens_per_sec = num_tokens / inference_time
        speed_str = f"{tokens_per_sec:.2f} tokens/sec"
    else:
        speed_str = "Inference time too short to compute speed"

    return speed_str, thinking, content
    
# Build the Gradio interface
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=5, label="Prompt"),
    outputs=[
        gr.Textbox(label="tokens/sec"),
        gr.Textbox(label="Thinking process"),
        gr.Textbox(label="Final response")
    ],
    title="Qwen3-0.6B OpenVINO + Gradio",
    description="OpenVINO-optimized Qwen3-0.6B inference app that separates the model's thinking from its final response, served through a GUI."
)

if __name__ == "__main__":
    demo.launch()
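    # To reach the app from other machines, launch() could instead be called as, e.g.,
    # demo.launch(server_name="0.0.0.0") or demo.launch(share=True) for a temporary public URL.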