# audio2text / whisper_stt.py
# Ander1's picture
# Upload 11 files
# 4903e5a verified
import whisper
import logging
from typing import Optional, Dict, Any
import torch
# Configure logging: set the root logger to INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def transcribe_audio_whisper(
    file_path: str,
    model_name: str = "base",
    language: Optional[str] = None,
    initial_prompt: Optional[str] = None,
    task: str = "transcribe"
) -> Optional[Dict[str, Any]]:
    """
    Convert an audio file to text with a Whisper model.

    Args:
        file_path: Path of the audio file to process.
        model_name: Whisper model name ("tiny", "base", "small", "medium", "large").
        language: Language of the audio as an ISO 639-1 code (e.g. "zh" for
            Chinese). When omitted, no language option is passed to Whisper.
        initial_prompt: Optional initial prompt text forwarded to Whisper.
        task: Task type, either "transcribe" or "translate".

    Returns:
        A dict with the keys "text", "language" and "segments", or None when
        ``task`` is not a supported value or any step fails (the error is
        logged instead of being raised).
    """
    # Reject unknown tasks before paying for a model load.
    # NOTE(review): Whisper appears to treat any task value other than
    # "transcribe" as translation, so a typo would silently run the wrong
    # task -- confirm against the installed Whisper version.
    if task not in ("transcribe", "translate"):
        logger.error("無效的任務類型: %r", task)
        return None

    try:
        # Prefer the GPU when CUDA is available.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info("使用設備: %s", device)

        # Load the requested model onto the selected device.
        logger.info("載入 Whisper %s 模型...", model_name)
        model = whisper.load_model(model_name, device=device)

        # Transcription options. Half precision is only requested on CUDA so
        # that CPU runs do not trigger Whisper's FP16 fallback warning.
        options = {
            "task": task,
            "verbose": True,
            "fp16": device == "cuda",
        }
        if language:
            options["language"] = language
        if initial_prompt:
            options["initial_prompt"] = initial_prompt

        # Run the transcription.
        logger.info("開始轉錄...")
        result = model.transcribe(file_path, **options)

        # Keep only the fields callers rely on, with defaults for optional ones.
        response = {
            "text": result["text"],
            "language": result.get("language", "unknown"),
            "segments": result.get("segments", []),
        }
        logger.info("轉錄完成")
        return response
    except Exception as e:
        # Best-effort contract: report the failure (with traceback) and
        # return None rather than propagating to the caller.
        logger.exception("轉錄失敗:%s", e)
        return None
def get_available_models() -> list:
    """
    Return the names of the selectable Whisper models, smallest first.

    A new list is built on every call, so callers may modify the result.
    """
    model_names = ("tiny", "base", "small", "medium", "large")
    return list(model_names)
def get_model_description(model_name: str) -> str:
    """
    Return the human-readable description of a Whisper model.

    Args:
        model_name: One of "tiny", "base", "small", "medium" or "large".

    Returns:
        The description text for the model, or "未知模型" when the name is
        not one of the known models.
    """
    catalog = dict(
        tiny="最小的模型,速度最快但準確度較低",
        base="基礎模型,平衡速度和準確度",
        small="小型模型,準確度較好",
        medium="中型模型,準確度高",
        large="最大的模型,準確度最高但需要較多資源",
    )
    try:
        return catalog[model_name]
    except KeyError:
        # Unknown model name: fall back to the generic label.
        return "未知模型"