Update bert_explainer.py

bert_explainer.py  CHANGED  (+134 −18)

This commit reorganizes the module under numbered section comments and, alongside the existing text classifier, adds OCR preprocessing (`preprocess_for_easyocr`) and an image-analysis entry point (`analyze_image`).
Old version (lines removed by this commit are marked −; some deleted lines appear empty in the exported view):

```diff
@@ -1,18 +1,25 @@
-import torch
-from transformers import BertTokenizer, BertModel
-from AI_Model_architecture import BertLSTM_CNN_Classifier
-import re
 import os
 import requests
 import jieba
 
-#
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-#
 model_path = "/tmp/model.pth"
 model_url = "https://huggingface.co/jerrynnms/scam-model/resolve/main/model.pth"
-
 if not os.path.exists(model_path):
     print("📦 下載 model.pth 中...")
     response = requests.get(model_url)
@@ -23,31 +30,36 @@
     else:
         raise FileNotFoundError("❌ 無法下載 model.pth")
 
-#
 tokenizer = BertTokenizer.from_pretrained("ckiplab/bert-base-chinese")
 
-# ✅ Initialize the custom classification model
 model = BertLSTM_CNN_Classifier()
 model.load_state_dict(torch.load(model_path, map_location=device))
 model.to(device)
 model.eval()
 
-#
 bert_model = BertModel.from_pretrained("ckiplab/bert-base-chinese", output_attentions=True)
 bert_model.to(device)
 bert_model.eval()
 
-# ✅ Predict a single sentence
 
 def predict_single_sentence(text: str, max_len=256):
     text = re.sub(r"\s+", "", text)
     text = re.sub(r"[^\u4e00-\u9fffA-Za-z0-9。,!?:/.\-]", "", text)
 
-
     input_ids = encoded["input_ids"].to(device)
     attention_mask = encoded["attention_mask"].to(device)
     token_type_ids = encoded["token_type_ids"].to(device)
 
     with torch.no_grad():
         output = model(input_ids, attention_mask, token_type_ids)
         prob = output.item()
@@ -55,24 +67,32 @@
 
     return label, prob
 
-# ✅ Extract high-attention tokens and convert them into natural-language words
 
 def extract_attention_keywords(text, top_k=5):
     cleaned = re.sub(r"\s+", "", text)
-
     input_ids = encoded["input_ids"].to(device)
     attention_mask = encoded["attention_mask"].to(device)
 
     with torch.no_grad():
         outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
-        attentions = outputs.attentions
 
-
     tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
     top_indices = attn.topk(top_k).indices.tolist()
-
     top_tokens = [tokens[i] for i in top_indices if tokens[i] not in ["[CLS]", "[SEP]", "[PAD]"]]
 
     words = list(jieba.cut(text))
     suspicious = []
     for word in words:
@@ -83,11 +103,22 @@
             suspicious.append(word)
             break
 
     return suspicious[:top_k] if suspicious else top_tokens[:top_k]
 
-# ✅ Wrap the analysis function used by the main API
 
 def analyze_text(text: str):
     label, prob = predict_single_sentence(text)
     prob_percent = round(prob * 100, 2)
     status = "詐騙" if label == 1 else "正常"
@@ -98,3 +129,88 @@
         "confidence": prob_percent,
         "suspicious_keywords": suspicious or ["(模型未聚焦可疑詞)"]
     }
```
New version, section by section (collapsed diff context is marked with `# …` comments). The imports now pull in the OCR stack (NumPy, OpenCV, EasyOCR, PIL):

```python
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # tolerate duplicate OpenMP runtimes (a common issue when torch and cv2 coexist)

import io
import re
import requests
import torch
import jieba
import numpy as np
import cv2
import easyocr
from PIL import Image
from transformers import BertTokenizer, BertModel
from AI_Model_architecture import BertLSTM_CNN_Classifier
```
```python
# ─────────────────────────────────────────────────────────────────────────────
# 1. Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Download and load the custom classifier (BertLSTM_CNN_Classifier)
model_path = "/tmp/model.pth"
model_url = "https://huggingface.co/jerrynnms/scam-model/resolve/main/model.pth"
if not os.path.exists(model_path):
    print("📦 下載 model.pth 中...")
    response = requests.get(model_url)
    # … (lines 26–29 collapsed in the diff view: presumably the status check
    #     and writing the downloaded bytes to model_path)
    else:
        raise FileNotFoundError("❌ 無法下載 model.pth")

# 3. Initialize the tokenizer and the custom classifier
tokenizer = BertTokenizer.from_pretrained("ckiplab/bert-base-chinese")

model = BertLSTM_CNN_Classifier()
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

# 4. Initialize the vanilla BERT model (used for attention maps)
bert_model = BertModel.from_pretrained("ckiplab/bert-base-chinese", output_attentions=True)
bert_model.to(device)
bert_model.eval()
# ─────────────────────────────────────────────────────────────────────────────
```
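A quick smoke test for this setup (not part of the commit; it assumes, as `predict_single_sentence` below does, that the classifier takes the three BERT tensors and returns a single-probability tensor):

```python
# Hypothetical smoke test: push one padded batch through the loaded classifier.
enc = tokenizer("測試", return_tensors="pt", truncation=True,
                padding="max_length", max_length=256)
with torch.no_grad():
    out = model(enc["input_ids"].to(device),
                enc["attention_mask"].to(device),
                enc["token_type_ids"].to(device))
print(float(out.item()))  # expected: a value in [0, 1]
```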
```python
# ─────────────────────────────────────────────────────────────────────────────
# 5. Predict a single sentence
def predict_single_sentence(text: str, max_len=256):
    # 5.1. Light cleanup: strip whitespace; keep CJK, Latin letters, digits, and a few punctuation marks
    text = re.sub(r"\s+", "", text)
    text = re.sub(r"[^\u4e00-\u9fffA-Za-z0-9。,!?:/.\-]", "", text)

    # 5.2. Tokenize and convert to tensors
    encoded = tokenizer(text, return_tensors="pt", truncation=True,
                        padding="max_length", max_length=max_len)
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    token_type_ids = encoded["token_type_ids"].to(device)

    # 5.3. Model inference
    with torch.no_grad():
        output = model(input_ids, attention_mask, token_type_ids)
        prob = output.item()
    # … (line 66, which derives `label` from `prob`, is collapsed in the diff view)

    return label, prob
```
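A minimal usage sketch (the sample sentence is mine; the exact thresholding lives in the collapsed line 66):

```python
# Hypothetical call — the example sentence is illustrative only.
label, prob = predict_single_sentence("您已中獎,請點擊連結領取獎金")
print(label, prob)  # label: 1 means scam, anything else is treated as normal; prob in [0, 1]
```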
```python
# 6. Extract high-attention tokens and map them back to natural-language words
def extract_attention_keywords(text, top_k=5):
    # 6.1. Clean the text (strip whitespace)
    cleaned = re.sub(r"\s+", "", text)

    # 6.2. Tokenize; only the attention is needed here, not the classifier
    encoded = tokenizer(cleaned, return_tensors="pt", truncation=True,
                        padding="max_length", max_length=128)
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # 6.3. Feed the text to the vanilla BERT model and take its attention maps
    with torch.no_grad():
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        attentions = outputs.attentions  # tuple: one attention tensor per transformer block

    # 6.4. Take the last layer's attention and average over all heads and all
    #      query tokens → a 1-D vector of length seq_len
    attn = attentions[-1][0].mean(dim=0).mean(dim=0)  # shape: (seq_len,)

    # 6.5. Recover the sentence's tokens and drop special tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    top_indices = attn.topk(top_k).indices.tolist()
    top_tokens = [tokens[i] for i in top_indices if tokens[i] not in ["[CLS]", "[SEP]", "[PAD]"]]

    # 6.6. Segment with jieba and map the high-attention tokens back to Chinese words
    words = list(jieba.cut(text))
    suspicious = []
    for word in words:
        # … (lines 99–102 collapsed in the diff view: presumably the check that
        #     `word` overlaps one of the high-attention tokens)
            suspicious.append(word)
            break

    # 6.7. Return up to top_k "suspicious words"; if none mapped, fall back to top_tokens
    return suspicious[:top_k] if suspicious else top_tokens[:top_k]
# ─────────────────────────────────────────────────────────────────────────────
```
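The averaging in step 6.4 is easier to follow with the shapes written out. A self-contained sketch (the dummy tensors are mine; 12 layers and 12 heads match bert-base, and the shapes follow the Hugging Face convention for `output_attentions=True`):

```python
import torch

# Stand-in for outputs.attentions: tuple of length num_layers, each element
# shaped (batch, num_heads, seq_len, seq_len).
attentions = tuple(torch.rand(1, 12, 128, 128) for _ in range(12))

last = attentions[-1][0]        # (12, 128, 128): heads x queries x keys, batch item 0
per_head = last.mean(dim=0)     # (128, 128): averaged over heads
attn = per_head.mean(dim=0)     # (128,): averaged over query positions — roughly,
                                # how much attention each token position receives
print(attn.shape)               # torch.Size([128])
```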
```python
# ─────────────────────────────────────────────────────────────────────────────
# 7. Main text-analysis function: returns the full result structure
def analyze_text(text: str):
    """
    Takes a plain-text string and returns:
    {
        "status": "詐騙" / "正常",
        "confidence": float (percentage),
        "suspicious_keywords": [list of extracted words]
    }
    """
    label, prob = predict_single_sentence(text)
    prob_percent = round(prob * 100, 2)
    status = "詐騙" if label == 1 else "正常"
    # … (lines 125–128 collapsed in the diff view: presumably setting `suspicious`
    #     via extract_attention_keywords and opening the returned dict)
        "confidence": prob_percent,
        "suspicious_keywords": suspicious or ["(模型未聚焦可疑詞)"]
    }
# ─────────────────────────────────────────────────────────────────────────────
```
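Expected usage (sample input and output values are illustrative only):

```python
# Hypothetical call.
result = analyze_text("您的帳戶已被凍結,請立即提供驗證碼解鎖")
# result has the shape documented in the docstring, e.g.:
# {"status": "詐騙", "confidence": 97.31, "suspicious_keywords": ["帳戶", "凍結", "驗證碼"]}
```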
```python
# ─────────────────────────────────────────────────────────────────────────────
# New in this commit: OCR preprocessing + image-analysis functions
# 8. Preprocessing: grayscale → CLAHE → HSV filter → binarize → upscale → blur,
#    returning a binarized NumPy image ready for OCR
def preprocess_for_easyocr(pil_image: Image.Image) -> np.ndarray:
    """
    Preprocesses a PIL Image and returns a binarized NumPy image
    (white text on a black background):
    1. PIL → NumPy (RGB → BGR)
    2. Grayscale + CLAHE (contrast enhancement)
    3. HSV colour filtering (here: masking an orange poster background)
    4. Fixed-threshold inverse binarization (dark text → white, rest → black)
    5. 2x upscaling + GaussianBlur smoothing
    """
    # 8.1. PIL → NumPy (RGB to BGR)
    img_bgr = np.array(pil_image.convert("RGB"))[:, :, ::-1]

    # 8.2. Grayscale
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # 8.3. CLAHE (contrast-limited adaptive histogram equalization)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)

    # 8.4. HSV colour filtering (this example targets an orange background)
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    lower_orange = np.array([5, 100, 100])
    upper_orange = np.array([20, 255, 255])
    mask_orange = cv2.inRange(hsv, lower_orange, upper_orange)
    filtered = enhanced.copy()
    filtered[mask_orange > 0] = 255  # set the orange background to white

    # 8.5. Fixed-threshold inverse binarization (dark text → white, background → black)
    _, thresh = cv2.threshold(filtered, 200, 255, cv2.THRESH_BINARY_INV)

    # 8.6. 2x upscaling & GaussianBlur smoothing
    scaled = cv2.resize(thresh, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)
    smoothed = cv2.GaussianBlur(scaled, (3, 3), 0)

    return smoothed
```
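A quick way to eyeball the preprocessing result (the input filename is hypothetical; the /tmp debug path mirrors the commented-out line in `analyze_image` below):

```python
# Hypothetical check: run the pipeline on a local image and save the result.
img = Image.open("sample_poster.png")   # hypothetical input file
binary = preprocess_for_easyocr(img)
print(binary.shape, binary.dtype)       # 2x the original height/width, dtype uint8
Image.fromarray(binary).save("/tmp/debug_processed.png")
```

Since the HSV mask and the fixed threshold of 200 are tuned to one orange-poster style, other colour schemes will likely need their own ranges.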
```python
# 9. Image analysis: OCR the preprocessed image → join the text → BERT analysis
def analyze_image(file_bytes, explain_mode="cnn"):
    """
    Takes image bytes and returns:
    {
        "status": "詐騙" / "正常" / "無法辨識文字",
        "confidence": float,
        "suspicious_keywords": [list of words]
    }
    Flow:
    1. bytes → PIL Image
    2. Preprocess → binarized NumPy image (white text on black)
    3. Read with EasyOCR and join into one long string
    4. No text found → return "unrecognizable"; otherwise → call analyze_text
    """
    # 9.1. bytes → PIL Image
    image = Image.open(io.BytesIO(file_bytes))

    # 9.2. OCR preprocessing: get the binarized NumPy image (white text on black)
    preprocessed_np = preprocess_for_easyocr(image)

    # [Optional debug] Save the preprocessed image to /tmp/debug_processed.png for inspection
    # Image.fromarray(preprocessed_np).save("/tmp/debug_processed.png")

    # 9.3. Run EasyOCR on the preprocessed image
    # (constructing the Reader on every call reloads the OCR models;
    #  hoisting it to module level would be cheaper)
    reader = easyocr.Reader(['ch_tra', 'en'], gpu=torch.cuda.is_available())
    results = reader.readtext(preprocessed_np)

    # 9.4. Join all recognized text fragments
    text = ' '.join([res[1] for res in results]).strip()

    # 9.5. If nothing was recognized, return "unrecognizable" immediately
    if not text:
        return {
            "status": "無法辨識文字",
            "confidence": 0.0,
            "suspicious_keywords": ["圖片中無可辨識的中文英文"]
        }

    # 9.6. Otherwise hand the extracted text to analyze_text for BERT analysis
    return analyze_text(text)  # NOTE: analyze_text(text: str) accepts no explain_mode;
                               # the committed call passed explain_mode=explain_mode,
                               # which would raise a TypeError at runtime
# ─────────────────────────────────────────────────────────────────────────────
```
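For context, the comment removed from the old version described `analyze_text` as the analysis function wrapped for the main API. A minimal sketch of how an API layer might consume this module (the FastAPI choice, endpoint paths, and payload shape are my assumptions, not part of this commit):

```python
# Hypothetical wiring — not part of this commit.
from fastapi import FastAPI, UploadFile

from bert_explainer import analyze_image, analyze_text

app = FastAPI()

@app.post("/analyze/text")
def analyze_text_endpoint(payload: dict):
    # Expects {"text": "..."} and returns the dict built by analyze_text.
    return analyze_text(payload["text"])

@app.post("/analyze/image")
async def analyze_image_endpoint(file: UploadFile):
    # Routes the uploaded bytes through OCR preprocessing and BERT analysis.
    return analyze_image(await file.read())
```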