import os

# Silence the duplicate-OpenMP-runtime error that can crash PyTorch on some hosts.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# HuggingFace cache paths (set before transformers / huggingface_hub are imported
# so the libraries actually pick them up).
os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/huggingface_cache"
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_DATASETS_CACHE"] = "/tmp/huggingface/datasets"
os.environ["HF_TRANSFORMERS_CACHE"] = "/tmp/huggingface/transformers"
# EasyOCR path (set before easyocr is imported)
os.environ["EASYOCR_MODULE_PATH"] = "/tmp/easyocr"

import io
import re

import jieba
import numpy as np
import torch
import easyocr
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import BertTokenizer
from lime.lime_text import LimeTextExplainer

from AI_Model_architecture import BertLSTM_CNN_Classifier

HF_TOKEN = os.environ.get("HF_TOKEN")

# OCR module: use a single writable directory for the EasyOCR models (guaranteed permissions)
safe_dir = '/tmp/easyocr'
os.makedirs(safe_dir, exist_ok=True)  # make sure the directory exists

reader = easyocr.Reader(
    ['ch_tra', 'en'],
    gpu=torch.cuda.is_available(),
    model_storage_directory=safe_dir,   # downloaded detection/recognition models go here
    user_network_directory=safe_dir     # custom network definitions go here as well
)

# Device selection (prefer GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load the model and tokenizer
def load_model_and_tokenizer():
    global model, tokenizer
    if os.path.exists("model.pth"):
        print("✅ 已找到 model.pth 載入模型")
        model_path = "model.pth"
    else:
        print("🚀 未找到 model.pth")
        model_path = hf_hub_download(
            repo_id="Bennie12/Bert-Lstm-Cnn-ScamDetecter",
            filename="model.pth",
            token=HF_TOKEN,
        )
    model = BertLSTM_CNN_Classifier()
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    tokenizer = BertTokenizer.from_pretrained("ckiplab/bert-base-chinese", use_fast=False)
    return model, tokenizer


model, tokenizer = load_model_and_tokenizer()


# Predict the classification result for a single sentence
def predict_single_sentence(model, tokenizer, sentence, max_len=256):
    sentence = re.sub(r"\s+", "", sentence)
    sentence = re.sub(r"[^\u4e00-\u9fffA-Za-z0-9。,!?:/._-]", "", sentence)
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )
    encoded = {k: v.to(device) for k, v in encoded.items()}
    with torch.no_grad():
        output = model(encoded["input_ids"], encoded["attention_mask"], encoded["token_type_ids"])
        prob = torch.sigmoid(output).item()
        label = int(prob > 0.5)
    risk = "🟢 低風險(正常)"
    if prob > 0.9:
        risk = "🔴 高風險(極可能是詐騙)"
    elif prob > 0.5:
        risk = "🟡 中風險(可疑)"
    pre_label = '詐騙' if label == 1 else '正常'
    return {
        "label": pre_label,
        "prob": prob,
        "risk": risk
    }


# predict_proba wrapper for LIME
def predict_proba(texts):
    # Batch-tokenize the perturbed texts
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256
    )
    # Move tensors to GPU or CPU
    encoded = {k: v.to(device) for k, v in encoded.items()}
    with torch.no_grad():
        outputs = model(encoded["input_ids"], encoded["attention_mask"], encoded["token_type_ids"])
        # outputs shape: (batch_size,)
        probs = torch.sigmoid(outputs).cpu().numpy()
    # Convert to the (N, 2) format LIME expects: [P(正常), P(詐騙)]
    probs_2d = np.vstack([1 - probs, probs]).T
    return probs_2d


# Initialize the LIME explainer
class_names = ['正常', '詐騙']
lime_explainer = LimeTextExplainer(class_names=class_names)


# Extract suspicious tokens (via LIME)
def suspicious_tokens(text, explainer=lime_explainer, top_k=5):
    try:
        explanation = explainer.explain_instance(
            text, predict_proba, num_features=top_k, num_samples=200
        )
        keywords = [word for word, weight in explanation.as_list()]
        return keywords
    except Exception as e:
        print("⚠ LIME 失敗,啟用 fallback:", e)
        fallback = ["繳費", "終止", "逾期", "限時", "驗證碼"]
        return [kw for kw in fallback if kw in text]


# Text cleanup
def clean_text(text):
    text = re.sub(r"https?://\S+", "", text)
re.sub(r"[a-zA-Z0-9:/.%\-_=+]{4,}", "", text) text = re.sub(r"\+?\d[\d\s\-]{5,}", "", text) text = re.sub(r"[^一-龥。,!?、]", "", text) sentences = re.split(r"[。!?]", text) cleaned = "。".join(sentences[:4]) return cleaned[:300] # 高亮顯示 def highlight_keywords(text, keywords, prob): """ 根據模型信心值 (prob) 動態決定螢光標註顏色, 並結合 jieba 斷詞,針對 LIME 輸出長片段進行子詞高亮標註。 """ if prob < 0.15: # 低風險完全不標註 return text # 決定標註顏色 if prob >= 0.65: css_class = 'red-highlight' else: css_class = 'yellow-highlight' # 對每個 keyword 進行 jieba 斷詞後標註 for phrase in keywords: for word in jieba.cut(phrase): word = word.strip() if len(word) >= 2 and word in text: text = text.replace(word, f"{word}") return text # 文字分析主流程 def analyze_text(text): cleaned_text = clean_text(text) result = predict_single_sentence(model, tokenizer, cleaned_text) label = result["label"] prob = result["prob"] risk = result["risk"] suspicious = suspicious_tokens(cleaned_text) # 依照可疑度做不同標註 highlighted_text = highlight_keywords(text, suspicious, prob) # 低風險下不回傳 suspicious_keywords if prob < 0.15: suspicious = [] print(f"\n📩 訊息內容:{text}") print(f"✅ 預測結果:{label}") print(f"📊 信心值:{round(prob*100, 2)}") print(f"⚠️ 風險等級:{risk}") print(f"可疑關鍵字擷取: {suspicious}") return { "status": label, "confidence": round(prob * 100, 2), "suspicious_keywords": suspicious, "highlighted_text": highlighted_text } # 圖片 OCR 分析 def analyze_image(file_bytes): image = Image.open(io.BytesIO(file_bytes)) image_np = np.array(image) results = reader.readtext(image_np) text = ' '.join([res[1] for res in results]).strip() if not text: return { "status" : "無法辨識文字", "confidence" : 0.0, "suspicious_keywords" : ["圖片中無可辨識的中文英文"], "highlighted_text": "無法辨識可疑內容" } return analyze_text(text)