Spaces:

princemaxp
/

CySecGuardians

Sleeping

App Files Files Community

princemaxp committed on Sep 24

Commit

3bd60c1

verified ·

1 Parent(s): 4038d7a

Update body_analyzer.py

Browse files

Files changed (1) hide show

body_analyzer.py +82 -82

body_analyzer.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# body_analyzer_v2.py
 import os
 import re
 import requests
@@ -8,31 +8,45 @@ HF_API_KEY = os.getenv("HF_API_KEY")
 HF_HEADERS = {"Authorization": f"Bearer {HF_API_KEY}"} if HF_API_KEY else {}
 HF_TIMEOUT = 20  # seconds
-# Hugging Face model names
-PHISHING_MODELS = [
-    "cybersectony/phishing-email-detection-distilbert_v2.4.1",
-    "ealvaradob/bert-finetuned-phishing"
-]
 ZERO_SHOT_MODEL = "facebook/bart-large-mnli"  # for intent/behavior
 # Suspicious phrase patterns
 SUSPICIOUS_PATTERNS = [
-    "verify your account", "urgent action", "click here", "reset password",
-    "confirm your identity", "bank account", "invoice", "payment required",
-    "unauthorized login", "compromised", "final reminder", "account suspended",
-    "account deactivated", "update your information", "legal action",
-    "limited time offer", "claim your prize", "verify immediately",
-    "verify now", "verify your credentials",
 ]
-# zero-shot candidate labels for message behavior
 BEHAVIOR_LABELS = [
-    "credential harvesting", "invoice/payment fraud", "marketing",
-    "benign", "malware", "account takeover",
 ]
 def _call_hf_text_model(model_name: str, text: str):
-    """Call HF Inference API for text classification"""
     if not HF_API_KEY:
         return None
     try:
@@ -48,7 +62,6 @@ def _call_hf_text_model(model_name: str, text: str):
         return None
 def _call_hf_zero_shot(text: str, candidate_labels: List[str]):
-    """Zero-shot classification for email behavior/intent"""
     if not HF_API_KEY:
         return None
     try:
@@ -63,35 +76,39 @@ def _call_hf_zero_shot(text: str, candidate_labels: List[str]):
     except Exception:
         return None
-def _parse_hf_model_output(result):
-    """Extract label and confidence from HF output"""
     if not result:
-        return None, 0.0
-    if isinstance(result, list) and len(result) > 0 and isinstance(result[0], dict):
-        label = result[0].get("label")
-        score = result[0].get("score", 0.0)
-        return label, float(score or 0.0)
-    if isinstance(result, dict) and "labels" in result and "scores" in result:
-        return result["labels"][0], float(result["scores"][0])
-    return None, 0.0
 def analyze_body(subject: str, body: str, urls: list, images: list):
     findings = []
     score = 0
-    highlighted_body = body or ""
-    combined_text = f"{subject}\n{body}".lower()
-    # 1) Basic heuristics: suspicious phrases
     for pattern in SUSPICIOUS_PATTERNS:
-        if pattern in combined_text:
             findings.append(f"Suspicious phrase detected: \"{pattern}\"")
-            score += 30 if pattern in (subject or "").lower() else 18
             try:
                 highlighted_body = re.sub(re.escape(pattern), f"<mark>{pattern}</mark>", highlighted_body, flags=re.IGNORECASE)
             except Exception:
                 pass
-    # 2) URL heuristics
     for u in urls or []:
         findings.append(f"Suspicious URL detected: {u}")
         score += 10
@@ -99,47 +116,41 @@ def analyze_body(subject: str, body: str, urls: list, images: list):
             highlighted_body = re.sub(re.escape(u), f"<mark>{u}</mark>", highlighted_body, flags=re.IGNORECASE)
         except Exception:
             pass
-        domain_match = re.search(r"https?://([^/]+)/?", u)
-        if domain_match:
-            domain = domain_match.group(1)
-            if len(domain) > 25 or any(ch.isdigit() for ch in domain.split(".")[0]):
-                findings.append(f"URL: suspicious-looking domain {domain}")
-                score += 10
-    # 3) ML Phishing detection using multiple HF models
-    ml_labels = []
-    ml_confidences = []
-    model_input = "\n".join([subject or "", body or ""] + (urls or []))
-    for phish_model in PHISHING_MODELS:
-        if HF_API_KEY and model_input:
-            result = _call_hf_text_model(phish_model, model_input)
-            label, conf = _parse_hf_model_output(result)
-            if label:
-                findings.append(f"HF phishing model ({phish_model}) → {label} (conf {conf:.2f})")
-                ml_labels.append(label)
-                ml_confidences.append(conf)
-    # Take the max confidence phishing prediction
-    if ml_confidences:
-        max_idx = ml_confidences.index(max(ml_confidences))
-        if "phish" in (ml_labels[max_idx] or "").lower():
-            score += int(ml_confidences[max_idx] * 100 * 0.9)
-    # 4) Zero-shot intent/behavior classification
-    behavior_label = None
     behavior_conf = 0.0
     if HF_API_KEY and model_input:
         zs = _call_hf_zero_shot(model_input, BEHAVIOR_LABELS)
-        if isinstance(zs, dict) and "labels" in zs and "scores" in zs:
-            behavior_label = zs["labels"][0]
-            behavior_conf = float(zs["scores"][0])
-            findings.append(f"Behavior inference → {behavior_label} (conf {behavior_conf:.2f})")
-            if behavior_conf >= 0.7:
-                score += int(behavior_conf * 30)
-    # 5) Final score clamping
-    score = max(0, min(score, 100))
-    # 6) Verdict
     if score >= 70:
         verdict = "🚨 Malicious"
     elif 50 <= score < 70:
@@ -150,16 +161,5 @@ def analyze_body(subject: str, body: str, urls: list, images: list):
         verdict = "✅ Safe"
         findings.append("No strong phishing signals detected by models/heuristics.")
-    # 7) Richer textual summary (like your example)
-    summary = f"""
-Email analysis summary:
-- Subject: {subject}
-- Body length: {len(body)} chars
-- Detected behavior/intent: {behavior_label} (conf {behavior_conf:.2f})
-- Top phishing alert: {ml_labels[max_idx] if ml_labels else 'None'}
-- Suspicious phrases found: {len([f for f in findings if 'Suspicious phrase' in f])}
-- Total score: {score}/100
-Verdict: {verdict}
-"""
-    return findings, score, highlighted_body, verdict, summary

+# body_analyzer.py
 import os
 import re
 import requests
 HF_HEADERS = {"Authorization": f"Bearer {HF_API_KEY}"} if HF_API_KEY else {}
 HF_TIMEOUT = 20  # seconds
+# ML model names
+PHISHING_MODEL = "cybersectony/phishing-email-detection-distilbert_v2.4.1"
 ZERO_SHOT_MODEL = "facebook/bart-large-mnli"  # for intent/behavior
 # Suspicious phrase patterns
 SUSPICIOUS_PATTERNS = [
+    "verify your account",
+    "urgent action",
+    "click here",
+    "reset password",
+    "confirm your identity",
+    "bank account",
+    "invoice",
+    "payment required",
+    "unauthorized login",
+    "compromised",
+    "final reminder",
+    "account suspended",
+    "account deactivated",
+    "update your information",
+    "legal action",
+    "limited time offer",
+    "claim your prize",
+    "verify immediately",
+    "verify now",
+    "verify your credentials",
 ]
+# Zero-shot candidate labels for intent/behavior
 BEHAVIOR_LABELS = [
+    "credential harvesting",
+    "invoice/payment fraud",
+    "marketing",
+    "benign",
+    "malware",
+    "account takeover",
 ]
 def _call_hf_text_model(model_name: str, text: str):
     if not HF_API_KEY:
         return None
     try:
         return None
 def _call_hf_zero_shot(text: str, candidate_labels: List[str]):
     if not HF_API_KEY:
         return None
     try:
     except Exception:
         return None
+def _parse_hf_phishing_model_output(result):
     if not result:
+        return None, 0.0, {}
+    if isinstance(result, list) and result and isinstance(result[0], dict):
+        r0 = result[0]
+        label = r0.get("label")
+        score = r0.get("score", 0.0)
+        return label, float(score), {label: float(score)}
+    if isinstance(result, dict):
+        labels = result.get("labels") or result.get("label") or []
+        scores = result.get("scores") or result.get("score") or []
+        if isinstance(labels, list) and isinstance(scores, list) and labels and scores:
+            all_probs = {lab: float(sc) for lab, sc in zip(labels, scores)}
+            max_lab = max(all_probs.items(), key=lambda x: x[1])
+            return max_lab[0], float(max_lab[1]), all_probs
+    return None, 0.0, {}
 def analyze_body(subject: str, body: str, urls: list, images: list):
     findings = []
     score = 0
+    highlighted_body = (body or "")
+    combined_lower = ((subject or "") + "\n" + (body or "")).lower()
     for pattern in SUSPICIOUS_PATTERNS:
+        if pattern in combined_lower:
             findings.append(f"Suspicious phrase detected: \"{pattern}\"")
+            score += 18
             try:
                 highlighted_body = re.sub(re.escape(pattern), f"<mark>{pattern}</mark>", highlighted_body, flags=re.IGNORECASE)
             except Exception:
                 pass
+    # URL checks
     for u in urls or []:
         findings.append(f"Suspicious URL detected: {u}")
         score += 10
             highlighted_body = re.sub(re.escape(u), f"<mark>{u}</mark>", highlighted_body, flags=re.IGNORECASE)
         except Exception:
             pass
+    # ML phishing model
+    ml_label = None
+    ml_conf = 0.0
+    model_input = "\n".join([subject or "", body or "", "\n".join(urls or [])]).strip()
+    if model_input and HF_API_KEY:
+        raw = _call_hf_text_model(PHISHING_MODEL, model_input)
+        label, conf, _ = _parse_hf_phishing_model_output(raw)
+        if label:
+            ml_label = label
+            ml_conf = conf
+            findings.append(f"HuggingFace phishing model → {label} (conf {conf:.2f})")
+            score += int(conf * 100 * 0.9)
+    # Zero-shot behavior
+    behavior = None
     behavior_conf = 0.0
     if HF_API_KEY and model_input:
         zs = _call_hf_zero_shot(model_input, BEHAVIOR_LABELS)
+        try:
+            if isinstance(zs, dict) and "labels" in zs and "scores" in zs:
+                behavior = zs["labels"][0]
+                behavior_conf = float(zs["scores"][0])
+                findings.append(f"Behavior inference → {behavior} (conf {behavior_conf:.2f})")
+                if behavior_conf >= 0.7:
+                    score += int(behavior_conf * 30)
+        except Exception:
+            pass
+    if ml_conf >= 0.8 and ("phishing" in (ml_label or "").lower()):
+        score = max(score, 80)
+    score = int(max(0, min(score, 100)))
+    # Verdict
     if score >= 70:
         verdict = "🚨 Malicious"
     elif 50 <= score < 70:
         verdict = "✅ Safe"
         findings.append("No strong phishing signals detected by models/heuristics.")
+    # Return exactly 4 values
+    return findings, score, highlighted_body, verdict