prashanth135 committed
Commit bebe233 · verified · 1 Parent(s): 056ea3d

Upload 38 files

Dockerfile ADDED
@@ -0,0 +1,45 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.9-slim
+
+ # Set environment variables
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+     PYTHONUNBUFFERED=1 \
+     PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \
+     HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Create a non-root user for Hugging Face security
+ RUN useradd -m -u 1000 user
+ WORKDIR /home/user/app
+
+ # Install system dependencies required for Playwright and ML libs
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     wget curl libglib2.0-0 libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 \
+     libcups2 libdrm2 libdbus-1-3 libxcb1 libxkbcommon0 libx11-6 libxcomposite1 \
+     libxdamage1 libxext6 libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install PyTorch CPU
+ RUN pip install --no-cache-dir torch==2.2.2 torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cpu
+
+ # Copy requirements and install
+ COPY --chown=user requirements.txt .
+ RUN grep -v "torch==" requirements.txt | grep -v "torchvision==" > req_filtered.txt && \
+     pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r req_filtered.txt
+
+ # Install Playwright browser
+ RUN playwright install chromium
+
+ # Copy project files
+ COPY --chown=user . .
+
+ # Create necessary directories and set permissions
+ RUN mkdir -p data logs bert_weights gnn cnn && \
+     chmod -R 777 data logs bert_weights
+
+ # Expose the Hugging Face default port
+ EXPOSE 7860
+
+ # Command to run on start (Port 7860 is required for HF)
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
__init__.py ADDED
@@ -0,0 +1 @@
+ # PhishGuard AI - CNN Module
admin.html ADDED
@@ -0,0 +1,250 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8">
+   <meta name="viewport" content="width=device-width, initial-scale=1.0">
+   <title>PhishGuard Admin</title>
+   <style>
+     @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
+     * { margin:0; padding:0; box-sizing:border-box; }
+     :root {
+       --bg: #0F0F14; --bg2: #1A1A24; --card: #22222E; --border: rgba(255,255,255,0.06);
+       --text: #EAEAF0; --text2: #8888A0; --accent: #534AB7;
+       --safe: #22C55E; --danger: #EF4444; --warn: #F59E0B;
+     }
+     body { font-family:'Inter',sans-serif; background:var(--bg); color:var(--text); min-height:100vh; }
+
+     /* Login */
+     .login-wrap { display:flex; align-items:center; justify-content:center; min-height:100vh; }
+     .login-box { background:var(--card); padding:36px; border-radius:16px; border:1px solid var(--border); width:340px; }
+     .login-box h2 { font-size:20px; margin-bottom:20px; text-align:center; }
+     .login-box input { width:100%; padding:10px 14px; background:var(--bg2); border:1px solid var(--border);
+       border-radius:8px; color:var(--text); font-size:14px; font-family:inherit; margin-bottom:12px; outline:none; }
+     .login-box input:focus { border-color:var(--accent); }
+     .login-box button { width:100%; padding:10px; background:linear-gradient(135deg,var(--accent),#6C5ECE);
+       border:none; border-radius:8px; color:#fff; font-size:14px; font-weight:600; cursor:pointer; font-family:inherit; }
+     .login-box button:hover { opacity:0.9; }
+     .login-error { color:var(--danger); font-size:12px; text-align:center; margin-top:8px; display:none; }
+
+     /* Dashboard */
+     .dashboard { display:none; max-width:1000px; margin:0 auto; padding:24px; }
+     .dash-header { display:flex; align-items:center; gap:12px; margin-bottom:24px; }
+     .dash-header h1 { font-size:22px; flex:1; }
+     .dash-header h1 span { color:var(--accent); }
+     .logout-btn { padding:6px 16px; background:var(--card); border:1px solid var(--border);
+       border-radius:8px; color:var(--text2); font-size:12px; cursor:pointer; font-family:inherit; }
+
+     /* Stats Cards */
+     .stats { display:grid; grid-template-columns:repeat(auto-fit,minmax(180px,1fr)); gap:12px; margin-bottom:24px; }
+     .stat-card { background:var(--card); border:1px solid var(--border); border-radius:12px; padding:16px; }
+     .stat-label { font-size:11px; color:var(--text2); text-transform:uppercase; letter-spacing:0.5px; }
+     .stat-value { font-size:28px; font-weight:700; margin-top:4px; }
+     .stat-sub { font-size:11px; color:var(--text2); margin-top:2px; }
+
+     /* Table */
+     .section-title { font-size:16px; font-weight:600; margin-bottom:12px; }
+     .table-wrap { background:var(--card); border:1px solid var(--border); border-radius:12px; overflow:hidden; margin-bottom:24px; }
+     table { width:100%; border-collapse:collapse; font-size:12px; }
+     th { background:var(--bg2); padding:10px 14px; text-align:left; font-weight:600; color:var(--text2);
+       text-transform:uppercase; letter-spacing:0.5px; font-size:10px; }
+     td { padding:10px 14px; border-top:1px solid var(--border); }
+     tr:hover td { background:rgba(255,255,255,0.02); }
+     .badge { display:inline-block; padding:2px 8px; border-radius:4px; font-size:10px; font-weight:600; }
+     .badge-phish { background:rgba(239,68,68,0.12); color:var(--danger); }
+     .badge-safe { background:rgba(34,197,94,0.12); color:var(--safe); }
+     .url-cell { max-width:300px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; font-family:'SF Mono',monospace; font-size:11px; color:var(--text2); }
+
+     /* Retrain Button */
+     .retrain-bar { display:flex; align-items:center; gap:12px; margin-bottom:24px; }
+     .retrain-btn { padding:10px 24px; background:linear-gradient(135deg,var(--accent),#6C5ECE);
+       border:none; border-radius:8px; color:#fff; font-size:13px; font-weight:600; cursor:pointer; font-family:inherit; }
+     .retrain-btn:hover { opacity:0.9; }
+     .retrain-btn:disabled { opacity:0.4; cursor:not-allowed; }
+     .retrain-status { font-size:12px; color:var(--text2); }
+
+     /* History */
+     .history-card { background:var(--card); border:1px solid var(--border); border-radius:12px; padding:16px; margin-bottom:8px;
+       display:flex; align-items:center; gap:16px; }
+     .hist-version { font-size:22px; font-weight:700; color:var(--accent); min-width:48px; text-align:center; }
+     .hist-detail { flex:1; }
+     .hist-detail div:first-child { font-size:13px; font-weight:500; }
+     .hist-detail div:last-child { font-size:11px; color:var(--text2); margin-top:2px; }
+     .hist-accuracy { font-size:14px; font-weight:600; }
+   </style>
+ </head>
+ <body>
+
+ <!-- Login Screen -->
+ <div class="login-wrap" id="loginScreen">
+   <div class="login-box">
+     <h2>🛡️ PhishGuard Admin</h2>
+     <input type="password" id="passInput" placeholder="Admin password" autocomplete="off">
+     <button onclick="attemptLogin()">Login</button>
+     <div class="login-error" id="loginError">Invalid password</div>
+   </div>
+ </div>
+
+ <!-- Dashboard (hidden until login) -->
+ <div class="dashboard" id="dashboard">
+   <div class="dash-header">
+     <h1>Phish<span>Guard</span> Admin</h1>
+     <button class="logout-btn" onclick="logout()">Logout</button>
+   </div>
+
+   <!-- Stats -->
+   <div class="stats">
+     <div class="stat-card">
+       <div class="stat-label">Total Feedback</div>
+       <div class="stat-value" id="sTotalFeedback">—</div>
+     </div>
+     <div class="stat-card">
+       <div class="stat-label">Phishing Reports</div>
+       <div class="stat-value" id="sPhishing" style="color:var(--danger)">—</div>
+     </div>
+     <div class="stat-card">
+       <div class="stat-label">Safe Reports</div>
+       <div class="stat-value" id="sSafe" style="color:var(--safe)">—</div>
+     </div>
+     <div class="stat-card">
+       <div class="stat-label">Model Version</div>
+       <div class="stat-value" id="sVersion" style="color:var(--accent)">—</div>
+       <div class="stat-sub" id="sLastRetrain">Never retrained</div>
+     </div>
+     <div class="stat-card">
+       <div class="stat-label">Unprocessed</div>
+       <div class="stat-value" id="sUnprocessed" style="color:var(--warn)">—</div>
+       <div class="stat-sub">of 50 needed</div>
+     </div>
+   </div>
+
+   <!-- Manual Retrain -->
+   <div class="retrain-bar">
+     <button class="retrain-btn" id="retrainBtn" onclick="triggerRetrain()">🔄 Trigger Retraining</button>
+     <div class="retrain-status" id="retrainStatus"></div>
+   </div>
+
+   <!-- Recent Feedback -->
+   <div class="section-title">Recent Feedback</div>
+   <div class="table-wrap">
+     <table>
+       <thead>
+         <tr><th>URL</th><th>Label</th><th>Source</th><th>Prediction</th><th>Time</th></tr>
+       </thead>
+       <tbody id="feedbackTable"><tr><td colspan="5" style="text-align:center;color:var(--text2)">Loading...</td></tr></tbody>
+     </table>
+   </div>
+
+   <!-- Retrain History -->
+   <div class="section-title">Retrain History</div>
+   <div id="historyList"><div style="color:var(--text2);font-size:12px">Loading...</div></div>
+ </div>
+
+ <script>
+ const BASE = window.location.origin;
+ let authToken = "";
+
+ // Login
+ function attemptLogin() {
+   const pass = document.getElementById("passInput").value;
+   fetch(`${BASE}/admin/login`, {
+     method: "POST",
+     headers: {"Content-Type":"application/json"},
+     body: JSON.stringify({password: pass})
+   })
+   .then(r => r.json())
+   .then(data => {
+     if (data.success) {
+       authToken = data.token;
+       document.getElementById("loginScreen").style.display = "none";
+       document.getElementById("dashboard").style.display = "block";
+       loadDashboard();
+     } else {
+       document.getElementById("loginError").style.display = "block";
+     }
+   })
+   .catch(() => { document.getElementById("loginError").style.display = "block"; });
+ }
+ document.getElementById("passInput").addEventListener("keyup", e => { if(e.key==="Enter") attemptLogin(); });
+
+ function logout() {
+   authToken = "";
+   document.getElementById("loginScreen").style.display = "flex";
+   document.getElementById("dashboard").style.display = "none";
+ }
+
+ // Dashboard data
+ function loadDashboard() {
+   // Stats
+   fetch(`${BASE}/admin/data?token=${authToken}`).then(r=>r.json()).then(data => {
+     if (data.error) { logout(); return; }
+     const s = data.stats;
+     document.getElementById("sTotalFeedback").textContent = s.total_feedback;
+     document.getElementById("sPhishing").textContent = s.phishing_corrections;
+     document.getElementById("sSafe").textContent = s.safe_corrections;
+     document.getElementById("sVersion").textContent = "v" + s.model_version;
+     document.getElementById("sUnprocessed").textContent = s.unprocessed_count;
+     document.getElementById("sLastRetrain").textContent = s.last_retrain
+       ? "Last: " + new Date(s.last_retrain).toLocaleString() : "Never retrained";
+
+     // Feedback table
+     const rows = data.recent.map(e => `
+       <tr>
+         <td class="url-cell" title="${esc(e.url)}">${esc(e.url)}</td>
+         <td><span class="badge ${e.label==='phishing'?'badge-phish':'badge-safe'}">${esc(e.label)}</span></td>
+         <td>${esc(e.source||'—')}</td>
+         <td>${e.original_prediction!=null ? (e.original_prediction*100).toFixed(0)+'%' : '—'}</td>
+         <td style="font-size:11px;color:var(--text2)">${e.timestamp ? new Date(e.timestamp).toLocaleString() : '—'}</td>
+       </tr>
+     `).join("");
+     document.getElementById("feedbackTable").innerHTML = rows || '<tr><td colspan="5" style="text-align:center;color:var(--text2)">No feedback yet</td></tr>';
+
+     // History
+     const hist = (s.retrain_history || []).reverse();
+     if (hist.length === 0) {
+       document.getElementById("historyList").innerHTML = '<div style="color:var(--text2);font-size:12px">No retraining history</div>';
+     } else {
+       document.getElementById("historyList").innerHTML = hist.map(h => `
+         <div class="history-card">
+           <div class="hist-version">v${h.version}</div>
+           <div class="hist-detail">
+             <div>Trained on ${h.samples} samples</div>
+             <div>${new Date(h.timestamp).toLocaleString()}</div>
+           </div>
+           <div class="hist-accuracy" style="color:${h.accuracy>=0.8?'var(--safe)':h.accuracy>=0.6?'var(--warn)':'var(--danger)'}">
+             ${(h.accuracy*100).toFixed(1)}%
+           </div>
+         </div>
+       `).join("");
+     }
+   });
+ }
+
+ // Retrain
+ function triggerRetrain() {
+   const btn = document.getElementById("retrainBtn");
+   btn.disabled = true;
+   btn.textContent = "⏳ Retraining...";
+   document.getElementById("retrainStatus").textContent = "Training in progress...";
+
+   fetch(`${BASE}/admin/retrain?token=${authToken}`, {method:"POST"})
+   .then(r=>r.json())
+   .then(data => {
+     document.getElementById("retrainStatus").textContent = data.message || "Done";
+     btn.disabled = false;
+     btn.textContent = "🔄 Trigger Retraining";
+     setTimeout(loadDashboard, 2000);
+   })
+   .catch(e => {
+     document.getElementById("retrainStatus").textContent = "Error: " + e.message;
+     btn.disabled = false;
+     btn.textContent = "🔄 Trigger Retraining";
+   });
+ }
+
+ function esc(s) { const d=document.createElement('div'); d.textContent=String(s||''); return d.innerHTML; }
+
+ // Auto-refresh every 30s
+ setInterval(() => { if(authToken) loadDashboard(); }, 30000);
+ </script>
+ </body>
+ </html>
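
Note: admin.html assumes a small same-origin JSON API — POST /admin/login returning {success, token}, GET /admin/data?token=... returning {stats, recent} (or {error} on a bad token), and POST /admin/retrain?token=.... main.py is not part of this excerpt, so the sketch below is illustrative only: the field names come from the JavaScript above, while the in-memory token store and the ADMIN_PASSWORD environment variable are assumptions, not the repo's actual implementation.

# Hypothetical sketch of the endpoints admin.html calls; NOT the repo's main.py.
import os
import secrets
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
_tokens = set()  # assumption: simple in-memory session tokens

class LoginBody(BaseModel):
    password: str

@app.post("/admin/login")
def admin_login(body: LoginBody):
    # ADMIN_PASSWORD env var is an assumption for this sketch
    if body.password == os.environ.get("ADMIN_PASSWORD", ""):
        token = secrets.token_hex(16)
        _tokens.add(token)
        return {"success": True, "token": token}
    return {"success": False}

@app.get("/admin/data")
def admin_data(token: str):
    if token not in _tokens:
        return {"error": "unauthorized"}  # admin.html logs out on "error"
    # Shape mirrors what loadDashboard() reads; values are placeholders.
    return {
        "stats": {
            "total_feedback": 0, "phishing_corrections": 0,
            "safe_corrections": 0, "model_version": 0,
            "unprocessed_count": 0, "last_retrain": None,
            "retrain_history": [],
        },
        "recent": [],
    }

@app.post("/admin/retrain")
def admin_retrain(token: str):
    if token not in _tokens:
        return {"error": "unauthorized"}
    return {"message": "Retraining started"}  # real backend would train here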
background.js ADDED
@@ -0,0 +1,604 @@
+ // ============================================================
+ // PhishGuard AI - background.js
+ // MV3 Service Worker with feedback, retraining triggers, and
+ // model version polling.
+ //
+ // State (chrome.storage.local):
+ //   phishguard_feedback_queue: FeedbackRecord[] (max 500, FIFO)
+ //   scan_count: int (resets at 50)
+ //   feedback_count: int (labeled samples since last retrain)
+ //   last_retrain_ts: ISO8601
+ //   model_version: int
+ //   session_id: UUIDv4
+ //
+ // Triggers:
+ //   1. scan_count >= 50 AND feedback_count >= 10
+ //   2. chrome.alarms "retrain_alarm" (24h) AND feedback_count >= 10
+ // ============================================================
+
+ // ── Backend URL ──────────────────────────────────────────────────────
+ const BACKEND_URL = "https://phishguard-api-z2wj.onrender.com";
+ const ANALYZE_URL = `${BACKEND_URL}/analyze`;
+ const RETRAIN_URL = `${BACKEND_URL}/retrain`;
+ const MODEL_VERSION_URL = `${BACKEND_URL}/model_version`;
+
+ // ── Constants ────────────────────────────────────────────────────────
+ const CACHE_TTL_MS = 30 * 60 * 1000;
+ const MAX_QUEUE_SIZE = 500;
+ const RETRAIN_URL_THRESHOLD = 50;
+ const MIN_LABELED_SAMPLES = 10;
+
+ // ── In-memory caches ─────────────────────────────────────────────────
+ const urlCache = new Map();
+ const tabResultCache = new Map();
+ const pageSignals = new Map();
+
+ // ── TIER 1: Whitelist (O(1) Set lookup) ──────────────────────────────
+ const WHITELIST = new Set([
+   "google.com","youtube.com","facebook.com","amazon.com","wikipedia.org",
+   "twitter.com","instagram.com","linkedin.com","microsoft.com","apple.com",
+   "github.com","stackoverflow.com","reddit.com","netflix.com","paypal.com",
+   "bankofamerica.com","chase.com","wellsfargo.com","yahoo.com","bing.com",
+   "outlook.com","office.com","live.com","adobe.com","dropbox.com",
+   "zoom.us","slack.com","spotify.com","twitch.tv","ebay.com",
+   "walmart.com","target.com","bestbuy.com","airbnb.com",
+   "x.com","tiktok.com","pinterest.com","quora.com","medium.com"
+ ]);
+
+ function getRootDomain(url) {
+   try {
+     const host = new URL(url).hostname.replace(/^www\./, "");
+     const parts = host.split(".");
+     return parts.slice(-2).join(".");
+   } catch { return null; }
+ }
+
+ // ── TIER 2: Local heuristic scoring ──────────────────────────────────
+ function heuristicScore(url) {
+   let score = 0;
+   const signals = [];
+   const u = url.toLowerCase();
+
+   // IP as hostname (25 pts)
+   if (/https?:\/\/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/.test(url)) {
+     score += 25; signals.push("IP as hostname");
+   }
+
+   // Suspicious TLD (20 pts)
+   const badTLDs = [".xyz",".tk",".ml",".ga",".cf",".gq",".pw",".top",".click"];
+   for (const tld of badTLDs) {
+     if (u.includes(tld)) { score += 20; signals.push(`Suspicious TLD (${tld})`); break; }
+   }
+
+   // Phishing keywords (15 pts)
+   const keywords = ["login","verify","secure","update","account","banking",
+     "signin","reset","confirm","suspend","webscr","cmd","payment","alert"];
+   const kwHits = keywords.filter(kw => u.includes(kw));
+   if (kwHits.length > 0) { score += 15; signals.push(`Keywords: ${kwHits.join(", ")}`); }
+
+   // Brand spoofing (15 pts)
+   const brands = ["paypal","google","apple","microsoft","amazon","netflix",
+     "facebook","instagram","chase","wellsfargo","bankofamerica"];
+   try {
+     const domain = getRootDomain(url);
+     for (const brand of brands) {
+       if (u.includes(brand) && domain && !domain.startsWith(brand)) {
+         score += 15; signals.push(`Brand spoofing: ${brand}`); break;
+       }
+     }
+   } catch {}
+
+   // Excessive subdomains (10 pts)
+   try {
+     const host = new URL(url).hostname;
+     const subCount = host.split(".").length - 2;
+     if (subCount >= 3) { score += 10; signals.push(`${subCount} subdomains`); }
+   } catch {}
+
+   // URL length (5 pts)
+   if (url.length > 100) { score += 5; signals.push(`Long URL (${url.length} chars)`); }
+
+   // Hyphens (5 pts)
+   try {
+     const host = new URL(url).hostname;
+     const hyphens = (host.match(/-/g) || []).length;
+     if (hyphens >= 3) { score += 5; signals.push(`${hyphens} hyphens in domain`); }
+   } catch {}
+
+   // Non-standard port (5 pts)
+   try {
+     const port = new URL(url).port;
+     if (port && port !== "80" && port !== "443") {
+       score += 5; signals.push(`Non-standard port :${port}`);
+     }
+   } catch {}
+
+   return { score: Math.min(score, 100), signals };
+ }
+
+ // ── URL Cache ────────────────────────────────────────────────────────
+ function getCached(url) {
+   const entry = urlCache.get(url);
+   if (!entry) return null;
+   if (Date.now() - entry.ts > CACHE_TTL_MS) { urlCache.delete(url); return null; }
+   return entry.result;
+ }
+
+ function setCache(url, result) {
+   urlCache.set(url, { result, ts: Date.now() });
+   if (urlCache.size > 500) {
+     const firstKey = urlCache.keys().next().value;
+     urlCache.delete(firstKey);
+   }
+ }
+
+ // ── Badge ────────────────────────────────────────────────────────────
+ function setBadge(tabId, status, text) {
+   const colors = {
+     safe: "#22C55E", blocked: "#EF4444", warn: "#F59E0B",
+     loading: "#534AB7", none: "#888888"
+   };
+   chrome.action.setBadgeBackgroundColor({ color: colors[status] || colors.none, tabId });
+   chrome.action.setBadgeText({ text: text || "", tabId });
+ }
+
+ // ── Backend fetch with retry ─────────────────────────────────────────
+ async function fetchBackend(url, payload, retryCount = 1) {
+   try {
+     const controller = new AbortController();
+     const timeout = setTimeout(() => controller.abort(), 15000);
+     const response = await fetch(url, {
+       method: "POST",
+       headers: { "Content-Type": "application/json" },
+       body: JSON.stringify(payload),
+       signal: controller.signal,
+     });
+     clearTimeout(timeout);
+     if (!response.ok) throw new Error(`Server ${response.status}`);
+     return await response.json();
+   } catch (err) {
+     if (retryCount > 0) {
+       await new Promise(r => setTimeout(r, 2000));
+       return fetchBackend(url, payload, retryCount - 1);
+     }
+     throw err;
+   }
+ }
+
+ // ── SHA256 hash ──────────────────────────────────────────────────────
+ async function sha256(text) {
+   const encoded = new TextEncoder().encode(text);
+   const hash = await crypto.subtle.digest("SHA-256", encoded);
+   return Array.from(new Uint8Array(hash)).map(b => b.toString(16).padStart(2, "0")).join("");
+ }
+
+ // ── Storage helpers ──────────────────────────────────────────────────
+ async function getStorage(keys) {
+   return new Promise(resolve => chrome.storage.local.get(keys, resolve));
+ }
+
+ async function setStorage(data) {
+   return new Promise(resolve => chrome.storage.local.set(data, resolve));
+ }
+
+ async function getQueue() {
+   const data = await getStorage(["phishguard_feedback_queue"]);
+   return data.phishguard_feedback_queue || [];
+ }
+
+ async function setQueue(queue) {
+   // FIFO eviction
+   if (queue.length > MAX_QUEUE_SIZE) {
+     queue = queue.slice(queue.length - MAX_QUEUE_SIZE);
+   }
+   await setStorage({ phishguard_feedback_queue: queue });
+ }
+
+ // ── ON INSTALL ───────────────────────────────────────────────────────
+ chrome.runtime.onInstalled.addListener(async () => {
+   const sessionId = crypto.randomUUID();
+   await setStorage({
+     session_id: sessionId,
+     scan_count: 0,
+     feedback_count: 0,
+     last_retrain_ts: null,
+     model_version: 0,
+     phishguard_feedback_queue: [],
+   });
+
+   // 24-hour retraining alarm
+   chrome.alarms.create("retrain_alarm", { periodInMinutes: 1440 });
+   // 30-minute model polling alarm
+   chrome.alarms.create("model_poll_alarm", { periodInMinutes: 30 });
+
+   console.log("[PhishGuard] Installed. Session:", sessionId);
+ });
+
+ // ── ALARM HANDLERS ───────────────────────────────────────────────────
+ chrome.alarms.onAlarm.addListener(async (alarm) => {
+   if (alarm.name === "retrain_alarm") {
+     console.log("[PhishGuard] Retrain alarm fired");
+     await checkRetrain("timer");
+   }
+   if (alarm.name === "model_poll_alarm") {
+     await pollModelVersion();
+   }
+ });
+
+ // ── MAIN URL LISTENER ────────────────────────────────────────────────
+ chrome.webNavigation.onCompleted.addListener(async (details) => {
+   if (details.frameId !== 0) return;
+   const url = details.url;
+   if (!url.startsWith("http")) return;
+
+   const tabId = details.tabId;
+   const domain = getRootDomain(url);
+   if (!domain) return;
+
+   setBadge(tabId, "loading", "…");
+
+   // TIER 1: Whitelist
+   if (WHITELIST.has(domain)) {
+     const result = {
+       url, status: "safe", tier: 1, method: "whitelist",
+       confidence: 0, heuristic_score: 0, signals: []
+     };
+     await setStorage({ lastResult: result });
+     tabResultCache.set(tabId, result);
+     setBadge(tabId, "safe", "✓");
+     return;
+   }
+
+   // Cache check
+   const cached = getCached(url);
+   if (cached) {
+     await setStorage({ lastResult: cached });
+     tabResultCache.set(tabId, cached);
+     setBadge(tabId, cached.status, cached.status === "blocked" ? "!" : "✓");
+     if (cached.status === "blocked") blockPage(tabId, url, cached);
+     return;
+   }
+
+   // TIER 2: Heuristic
+   const hResult = heuristicScore(url);
+
+   if (hResult.score >= 80) {
+     const result = {
+       url, status: "blocked", tier: 2, method: "heuristic",
+       confidence: hResult.score / 100, heuristic_score: hResult.score,
+       signals: hResult.signals, is_phishing: true
+     };
+     setCache(url, result);
+     await setStorage({ lastResult: result });
+     tabResultCache.set(tabId, result);
+     setBadge(tabId, "blocked", "!");
+     blockPage(tabId, url, result);
+     await storeFeedbackRecord(url, result);
+     await incrementScanCount();
+     return;
+   }
+
+   // TIER 3+4: Send to backend
+   const signals = pageSignals.get(tabId) || {};
+   try {
+     const apiResult = await fetchBackend(ANALYZE_URL, {
+       url,
+       heuristic_score: hResult.score,
+       page_title: signals.title || "",
+       page_snippet: signals.snippet || "",
+     });
+
+     const finalResult = {
+       url,
+       status: apiResult.is_phishing ? "blocked" : "safe",
+       tier: apiResult.tier || 3,
+       method: apiResult.method || "ensemble",
+       confidence: apiResult.confidence || 0,
+       heuristic_score: apiResult.heuristic_score || hResult.score,
+       signals: apiResult.signals || hResult.signals,
+       is_phishing: apiResult.is_phishing,
+       details: apiResult.details || {},
+     };
+
+     setCache(url, finalResult);
+     await setStorage({ lastResult: finalResult });
+     tabResultCache.set(tabId, finalResult);
+
+     if (finalResult.status === "blocked") {
+       setBadge(tabId, "blocked", "!");
+       blockPage(tabId, url, finalResult);
+     } else if (finalResult.confidence >= 0.4) {
+       setBadge(tabId, "warn", "?");
+     } else {
+       setBadge(tabId, "safe", "✓");
+     }
+
+     await storeFeedbackRecord(url, finalResult);
+
+   } catch (err) {
+     console.log("[PhishGuard] Backend unreachable:", err.message);
+     const fallback = {
+       url,
+       status: hResult.score >= 50 ? "blocked" : "safe",
+       tier: 2,
+       method: "heuristic-fallback",
+       confidence: hResult.score / 100,
+       heuristic_score: hResult.score,
+       signals: hResult.signals,
+       is_phishing: hResult.score >= 50,
+       details: { backend_error: err.message },
+     };
+     setCache(url, fallback);
+     await setStorage({ lastResult: fallback });
+     tabResultCache.set(tabId, fallback);
+
+     if (hResult.score >= 50) {
+       setBadge(tabId, "blocked", "!");
+       blockPage(tabId, url, fallback);
+     } else if (hResult.score >= 30) {
+       setBadge(tabId, "warn", "?");
+     } else {
+       setBadge(tabId, "none", "");
+     }
+
+     await storeFeedbackRecord(url, fallback);
+   }
+
+   await incrementScanCount();
+   await checkRetrain("count");
+   pageSignals.delete(tabId);
+
+ }, { url: [{ schemes: ["http", "https"] }] });
+
+ // ── Feedback Record Storage ──────────────────────────────────────────
+ async function storeFeedbackRecord(url, result) {
+   const urlHash = await sha256(url);
+   const record = {
+     url,
+     verdict: result.is_phishing ? "phishing" : "safe",
+     confidence: result.confidence || 0,
+     tier_used: result.tier || 0,
+     heuristic_score: result.heuristic_score || 0,
+     signals: result.signals || [],
+     user_feedback: null,
+     timestamp: new Date().toISOString(),
+     feedback_ts: null,
+     url_hash: urlHash,
+     session_id: (await getStorage(["session_id"])).session_id || "",
+   };
+
+   const queue = await getQueue();
+   queue.push(record);
+   await setQueue(queue);
+ }
+
+ async function incrementScanCount() {
+   const data = await getStorage(["scan_count"]);
+   await setStorage({ scan_count: (data.scan_count || 0) + 1 });
+ }
+
+ // ── Block Page ───────────────────────────────────────────────────────
+ function blockPage(tabId, url, result) {
+   chrome.storage.local.set({ lastResult: { ...result, status: "blocked" } });
+   tabResultCache.set(tabId, result);
+   const score = Math.round((result.confidence || 0) * 100);
+   chrome.tabs.update(tabId, {
+     url: chrome.runtime.getURL("popup.html") +
+       "?blocked=1&url=" + encodeURIComponent(url) +
+       "&score=" + score +
+       "&method=" + encodeURIComponent(result.method || "")
+   });
+ }
+
+ // ── Retrain Check ────────────────────────────────────────────────────
+ async function checkRetrain(trigger = "count") {
+   const queue = await getQueue();
+   const labeled = queue.filter(r => r.user_feedback !== null);
+
+   if (labeled.length < MIN_LABELED_SAMPLES) {
+     console.log(`[PhishGuard] Not enough labeled samples (${labeled.length}/${MIN_LABELED_SAMPLES})`);
+     return;
+   }
+
+   const data = await getStorage(["scan_count"]);
+   const scanCount = data.scan_count || 0;
+
+   if (trigger === "timer" || scanCount >= RETRAIN_URL_THRESHOLD) {
+     console.log(`[PhishGuard] Triggering retrain: trigger=${trigger}, labeled=${labeled.length}, scans=${scanCount}`);
+     await sendRetrainRequest(labeled, trigger);
+   }
+ }
+
+ async function sendRetrainRequest(samples, trigger) {
+   const data = await getStorage(["session_id"]);
+   try {
+     const result = await fetchBackend(RETRAIN_URL, {
+       samples,
+       trigger,
+       session_id: data.session_id || "",
+       extension_version: "3.0",
+     });
+
+     if (result.status === "success") {
+       // Reset counters
+       await setStorage({
+         scan_count: 0,
+         feedback_count: 0,
+         last_retrain_ts: new Date().toISOString(),
+       });
+
+       // Remove sent records from queue
+       const queue = await getQueue();
+       const sentHashes = new Set(samples.map(s => s.url_hash));
+       const remaining = queue.filter(r => !sentHashes.has(r.url_hash));
+       await setQueue(remaining);
+
+       // Show notification
+       showRetrainNotification(result.accuracy_delta || {});
+
+       console.log("[PhishGuard] Retrain success:", result);
+     }
+   } catch (err) {
+     console.error("[PhishGuard] Retrain request failed:", err.message);
+   }
+ }
+
+ function showRetrainNotification(delta) {
+   const bertDelta = delta.bert ? `BERT: ${(delta.bert * 100).toFixed(1)}%` : "";
+   const gnnDelta = delta.gnn ? `GNN: ${(delta.gnn * 100).toFixed(1)}%` : "";
+   const parts = [bertDelta, gnnDelta].filter(Boolean).join(", ");
+
+   chrome.notifications.create("retrain_complete", {
+     type: "basic",
+     iconUrl: "icons/icon48.png",
+     title: "PhishGuard AI Updated",
+     message: parts ? `Models improved! ${parts} accuracy from your feedback` :
+       "Models updated with your feedback",
+   });
+ }
+
+ // ── Model Version Polling ────────────────────────────────────────────
+ async function pollModelVersion() {
+   try {
+     const controller = new AbortController();
+     const timeout = setTimeout(() => controller.abort(), 10000);
+     const resp = await fetch(MODEL_VERSION_URL, { signal: controller.signal });
+     clearTimeout(timeout);
+
+     if (!resp.ok) return;
+     const info = await resp.json();
+
+     const stored = await getStorage(["model_version"]);
+     if (info.version > (stored.model_version || 0)) {
+       await setStorage({ model_version: info.version });
+       // Clear URL cache (stale results)
+       urlCache.clear();
+
+       chrome.notifications.create("model_updated", {
+         type: "basic",
+         iconUrl: "icons/icon48.png",
+         title: "PhishGuard Models Updated",
+         message: `Model v${info.version} is now active`,
+       });
+     }
+   } catch (err) {
+     // Silently fail — model polling is best-effort
+   }
+ }
+
+ // ── Message Handler ──────────────────────────────────────────────────
+ chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => {
+   // Page signals from content.js
+   if (msg.type === "page_signals") {
+     if (sender.tab) {
+       pageSignals.set(sender.tab.id, {
+         title: msg.title || "",
+         snippet: msg.snippet || "",
+         signals: msg.signals || [],
+       });
+     }
+   }
+
+   // Submit feedback from popup.js / content.js
+   if (msg.type === "submit_feedback") {
+     (async () => {
+       const queue = await getQueue();
+       const idx = queue.findIndex(r => r.url_hash === msg.url_hash);
+       if (idx >= 0) {
+         queue[idx].user_feedback = msg.feedback; // "correct" or "incorrect"
+         queue[idx].feedback_ts = new Date().toISOString();
+         await setQueue(queue);
+
+         // Increment feedback count
+         const data = await getStorage(["feedback_count"]);
+         await setStorage({ feedback_count: (data.feedback_count || 0) + 1 });
+
+         // Check if we should trigger retraining
+         await checkRetrain("count");
+
+         sendResponse({ success: true });
+       } else {
+         sendResponse({ success: false, error: "Record not found" });
+       }
+     })();
+     return true; // async response
+   }
+
+   // Get status for popup
+   if (msg.type === "get_status") {
+     (async () => {
+       const data = await getStorage([
+         "scan_count", "feedback_count", "last_retrain_ts",
+         "model_version", "session_id"
+       ]);
+       const queue = await getQueue();
+       const labeled = queue.filter(r => r.user_feedback !== null).length;
+
+       const lastRetrain = data.last_retrain_ts ? new Date(data.last_retrain_ts) : null;
+       const now = Date.now();
+       const nextTimerMs = lastRetrain
+         ? Math.max(0, (24 * 60 * 60 * 1000) - (now - lastRetrain.getTime()))
+         : 24 * 60 * 60 * 1000;
+
+       sendResponse({
+         scan_count: data.scan_count || 0,
+         feedback_count: data.feedback_count || 0,
+         labeled_count: labeled,
+         last_retrain_ts: data.last_retrain_ts,
+         model_version: data.model_version || 0,
+         next_retrain_urls_remaining: Math.max(0, RETRAIN_URL_THRESHOLD - (data.scan_count || 0)),
+         next_retrain_time_remaining_ms: nextTimerMs,
+         min_labeled_needed: Math.max(0, MIN_LABELED_SAMPLES - labeled),
+       });
+     })();
+     return true;
+   }
+
+   // Per-tab result cache query from popup
+   if (msg.type === "get_tab_result") {
+     const result = tabResultCache.get(msg.tabId);
+     sendResponse({ result: result || null });
+     return false;
+   }
+
+   // User override (Proceed Anyway)
+   if (msg.type === "whitelist_url") {
+     const override = {
+       url: msg.url, status: "safe", tier: 0,
+       method: "user-override", confidence: 0
+     };
+     setCache(msg.url, override);
+     chrome.storage.local.set({ lastResult: override });
+     sendResponse({ success: true });
+   }
+
+   // Gmail scanner bridge
+   if (msg.action === "analyzeEmail") {
+     const emailURL = ANALYZE_URL.replace(/\/analyze\/?$/, "/analyze/email");
+     fetch(emailURL, {
+       method: "POST",
+       headers: { "Content-Type": "application/json" },
+       body: JSON.stringify(msg.data),
+     })
+     .then(r => r.ok ? r.json() : Promise.reject(new Error(`${r.status}`)))
+     .then(data => sendResponse(data))
+     .catch(err => sendResponse({
+       status: "error",
+       analysis: { isPhishing: false, probability: 0, reason: "Backend unreachable" }
+     }));
+     return true;
+   }
+ });
+
+ // ── Tab cleanup ──────────────────────────────────────────────────────
+ chrome.tabs.onRemoved.addListener(tabId => {
+   pageSignals.delete(tabId);
+   tabResultCache.delete(tabId);
+ });
+
+ chrome.tabs.onUpdated.addListener((tabId, changeInfo) => {
+   if (changeInfo.url) {
+     tabResultCache.delete(tabId);
+     setBadge(tabId, "none", "");
+   }
+ });
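
For reference, storeFeedbackRecord() above fixes the shape of the records that /retrain later receives in its samples array. A minimal Python sketch of that schema on the backend side follows; the dataclass and helper are illustrative, and only the field names and the MIN_LABELED_SAMPLES = 10 precondition come from the code above.

# Illustrative mirror of the extension's FeedbackRecord; not the repo's code.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class FeedbackRecord:
    url: str
    verdict: str                        # "phishing" | "safe"
    confidence: float                   # ensemble confidence in [0, 1]
    tier_used: int                      # which tier produced the verdict
    heuristic_score: int                # 0-100 Tier-2 score
    signals: List[str] = field(default_factory=list)
    user_feedback: Optional[str] = None # "correct" | "incorrect" | None
    timestamp: str = ""                 # ISO-8601 scan time
    feedback_ts: Optional[str] = None   # ISO-8601 label time
    url_hash: str = ""                  # SHA-256 of the URL
    session_id: str = ""

def labeled(records: List[FeedbackRecord]) -> List[FeedbackRecord]:
    # Mirrors the extension's retrain precondition: only records the user
    # actually labeled count toward MIN_LABELED_SAMPLES (10).
    return [r for r in records if r.user_feedback is not None]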
bert_analyzer.py ADDED
@@ -0,0 +1,375 @@
+ # ============================================================
+ # PhishGuard AI - bert_analyzer.py
+ # Tier 3a: BERT NLP Phishing Classifier
+ #
+ # Model: ealvaradob/bert-finetuned-phishing (HuggingFace Hub)
+ # Tokenization: split on [-./=?&_~%@:] to preserve homoglyphs
+ # Input: "URL: {tokenized_url}. Title: {title}. Content: {snippet}"
+ # Output: P_bert ∈ [0,1]
+ # Supports: load, predict, fine-tune, incremental_update, save/load
+ # ============================================================
+
+ from __future__ import annotations
+
+ import re
+ import math
+ import logging
+ import threading
+ from pathlib import Path
+ from typing import List, Tuple, Optional, Dict
+
+ logger = logging.getLogger("phishguard.bert")
+
+ # ── Model state ──────────────────────────────────────────────────────
+ _classifier = None
+ _tokenizer = None
+ _model = None
+ _use_bert: bool = False
+ _bert_load_attempted: bool = False
+ _bert_lock = threading.Lock()
+
+ # Check if transformers library is installed
+ _transformers_available: bool = False
+ try:
+     import transformers as _tf_module
+     _transformers_available = True
+     logger.info("transformers library found — BERT will lazy-load on first call")
+ except ImportError:
+     logger.info("transformers not installed — using keyword NLP fallback")
+
+
+ # ── Phishing pattern databases (for keyword fallback) ────────────────
+ PHISHING_TERMS = [
+     "verify your account", "suspended", "click here immediately",
+     "unusual activity", "confirm your identity", "limited time",
+     "your password has been", "unauthorized access", "act now",
+     "secure your account", "login credentials", "reset password immediately",
+     "your account will be", "verify your identity", "we noticed suspicious",
+ ]
+
+ PHISHING_KEYWORDS = [
+     "login", "secure", "verify", "account", "update", "confirm",
+     "banking", "paypal", "signin", "password", "suspend", "alert",
+     "restore", "unusual", "limited", "expire", "urgent", "immediately",
+ ]
+
+ BRAND_NAMES = [
+     "paypal", "google", "apple", "microsoft", "amazon", "netflix",
+     "facebook", "instagram", "twitter", "linkedin", "chase", "wells",
+     "bankofamerica", "citibank", "usps", "fedex", "ebay",
+ ]
+
+
+ class BERTPhishingClassifier:
+     """
+     BERT-based phishing text classifier.
+     Wraps HuggingFace model with URL-aware tokenization.
+     """
+
+     DEFAULT_MODEL = "ealvaradob/bert-finetuned-phishing"
+     FALLBACK_MODEL = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
+
+     def __init__(self, model_name: Optional[str] = None) -> None:
+         self.model_name: str = model_name or self.DEFAULT_MODEL
+         self._pipeline = None
+         self._tokenizer = None
+         self._model = None
+         self._loaded: bool = False
+         self._lock = threading.Lock()
+         self._re_url_split = re.compile(r"[-./=?&_~%@:]+")
+
+     def load_model(self) -> None:
+         """Load BERT model from HuggingFace Hub with cache fallback."""
+         if self._loaded:
+             return
+         with self._lock:
+             if self._loaded:
+                 return
+             if not _transformers_available:
+                 logger.warning("transformers not available, BERT disabled")
+                 return
+             try:
+                 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+
+                 # Try primary model, fall back to smaller model
+                 for model_id in [self.model_name, self.FALLBACK_MODEL]:
+                     try:
+                         self._pipeline = pipeline(
+                             "text-classification",
+                             model=model_id,
+                             truncation=True,
+                             max_length=512,
+                             device=-1,
+                         )
+                         self._tokenizer = AutoTokenizer.from_pretrained(model_id)
+                         self._model = AutoModelForSequenceClassification.from_pretrained(model_id)
+                         self.model_name = model_id
+                         self._loaded = True
+                         logger.info(f"BERT model loaded: {model_id}")
+                         return
+                     except Exception as e:
+                         logger.warning(f"Failed to load {model_id}: {e}")
+                         continue
+
+                 logger.error("All BERT model candidates failed")
+
+             except Exception as e:
+                 logger.error(f"BERT initialization failed: {e}")
+
+     def tokenize_url(self, url: str) -> str:
+         """
+         Split URL on [-./=?&_~%@:] to preserve homoglyphs.
+         Example: "paypa1-l0gin.xyz/verify" → "paypa1 l0gin xyz verify"
+         """
+         text = url.replace("https://", "").replace("http://", "")
+         tokens = self._re_url_split.split(text)
+         return " ".join(t for t in tokens if t)
+
+     def predict(self, url: str, title: str = "", snippet: str = "") -> float:
+         """
+         Predict phishing probability for a URL + page context.
+         Returns P_bert ∈ [0,1].
+         """
+         self.load_model()
+
+         if self._loaded and self._pipeline is not None:
+             return self._predict_bert(url, title, snippet)
+         return self._predict_keyword(url, title, snippet)
+
+     def _predict_bert(self, url: str, title: str, snippet: str) -> float:
+         """BERT model prediction path."""
+         url_text = self.tokenize_url(url)
+         combined = f"URL: {url_text}. Title: {title}. Content: {snippet[:300]}"
+
+         result = self._pipeline(combined[:512])[0]
+         label = result["label"].upper()
+         confidence = result["score"]
+
+         # Map label to phishing probability
+         if any(kw in label for kw in ["SPAM", "PHISH", "MALICIOUS", "LABEL_1", "1"]):
+             raw_prob = confidence
+         else:
+             raw_prob = 1.0 - confidence
+
+         # Boost with keyword signals
+         text_lower = combined.lower()
+         phrase_hits = sum(1 for p in PHISHING_TERMS if p in text_lower)
+         adjusted = min(raw_prob + (phrase_hits * 0.05), 1.0)
+
+         return round(adjusted, 4)
+
+     def _predict_keyword(self, url: str, title: str, snippet: str) -> float:
+         """Keyword-based fallback when BERT is unavailable."""
+         combined = f"{url} {title} {snippet}".lower()
+         url_lower = url.lower()
+         score = 0.0
+
+         # Keyword hits in URL
+         kw_hits = sum(1 for kw in PHISHING_KEYWORDS if kw in url_lower)
+         score += min(kw_hits * 0.08, 0.40)
+
+         # Phrase matches in content
+         phrase_hits = sum(1 for p in PHISHING_TERMS if p in combined)
+         score += min(phrase_hits * 0.12, 0.48)
+
+         # Brand spoofing
+         for brand in BRAND_NAMES:
+             if brand in url_lower:
+                 if f"{brand}.com" not in url_lower:
+                     score += 0.20
+                     break
+
+         # IP as hostname
+         if re.match(r"https?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", url):
+             score += 0.20
+
+         # Shannon entropy of hostname
+         try:
+             from urllib.parse import urlparse
+             host = urlparse(url if "://" in url else f"http://{url}").hostname or ""
+             if host:
+                 length = len(host)
+                 freq: Dict[str, int] = {}
+                 for c in host:
+                     freq[c] = freq.get(c, 0) + 1
+                 entropy = -sum(
+                     (cnt / length) * math.log2(cnt / length) for cnt in freq.values()
+                 )
+                 if entropy > 3.5:
+                     score += 0.10
+         except Exception:
+             pass
+
+         return round(min(score, 1.0), 4)
+
+     def incremental_update(
+         self,
+         samples: List[Tuple[str, int]],
+         lr: float = 1e-5,
+         epochs: int = 1,
+         label_smoothing: float = 0.1,
+     ) -> Optional[float]:
+         """
+         Incremental update: unfreeze last 2 transformer layers only.
+         Returns accuracy_delta (float) or None if update failed.
+
+         samples: list of (url, label) where label is 0 or 1
+         """
+         if not self._loaded or self._model is None or self._tokenizer is None:
+             logger.warning("BERT not loaded, cannot incrementally update")
+             return None
+
+         if len(samples) < 5:
+             logger.warning(f"Too few samples ({len(samples)}) for BERT update")
+             return None
+
+         try:
+             import torch
+             from torch.utils.data import DataLoader, TensorDataset
+             from torch.optim import AdamW
+
+             device = torch.device("cpu")
+             model = self._model.to(device)
+
+             # Freeze all layers
+             for param in model.parameters():
+                 param.requires_grad = False
+
+             # Unfreeze last 2 transformer layers + classifier
+             if hasattr(model, "bert"):
+                 encoder_layers = model.bert.encoder.layer
+                 for layer in encoder_layers[-2:]:
+                     for param in layer.parameters():
+                         param.requires_grad = True
+             if hasattr(model, "classifier"):
+                 for param in model.classifier.parameters():
+                     param.requires_grad = True
+
+             # Prepare data
+             texts = [self.tokenize_url(url) for url, _ in samples]
+             labels = [label for _, label in samples]
+
+             encodings = self._tokenizer(
+                 texts, truncation=True, padding=True, max_length=512,
+                 return_tensors="pt"
+             )
+             label_tensor = torch.tensor(labels, dtype=torch.long).to(device)
+
+             dataset = TensorDataset(
+                 encodings["input_ids"].to(device),
+                 encodings["attention_mask"].to(device),
+                 label_tensor,
+             )
+             batch_size = min(len(samples), 16)
+             loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+             # Pre-update accuracy
+             model.eval()
+             with torch.no_grad():
+                 pre_correct = 0
+                 for batch in loader:
+                     ids, mask, labs = batch
+                     outputs = model(input_ids=ids, attention_mask=mask)
+                     preds = torch.argmax(outputs.logits, dim=1)
+                     pre_correct += (preds == labs).sum().item()
+             pre_acc = pre_correct / len(samples)
+
+             # Train
+             optimizer = AdamW(
+                 filter(lambda p: p.requires_grad, model.parameters()),
+                 lr=lr,
+             )
+             loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=label_smoothing)
+
+             model.train()
+             for epoch in range(epochs):
+                 total_loss = 0.0
+                 for batch in loader:
+                     ids, mask, labs = batch
+                     optimizer.zero_grad()
+                     outputs = model(input_ids=ids, attention_mask=mask)
+                     loss = loss_fn(outputs.logits, labs)
+                     loss.backward()
+                     optimizer.step()
+                     total_loss += loss.item()
+                 logger.info(f"BERT incremental epoch {epoch+1}/{epochs}, loss={total_loss/len(loader):.4f}")
+
+             # Post-update accuracy
+             model.eval()
+             with torch.no_grad():
+                 post_correct = 0
+                 for batch in loader:
+                     ids, mask, labs = batch
+                     outputs = model(input_ids=ids, attention_mask=mask)
+                     preds = torch.argmax(outputs.logits, dim=1)
+                     post_correct += (preds == labs).sum().item()
+             post_acc = post_correct / len(samples)
+
+             delta = post_acc - pre_acc
+             self._model = model
+             logger.info(f"BERT incremental update: {pre_acc:.4f} → {post_acc:.4f} (Δ={delta:+.4f})")
+             return round(delta, 4)
+
+         except Exception as e:
+             logger.error(f"BERT incremental update failed: {e}")
+             return None
+
+     def save(self, path: Path) -> None:
+         """Save model and tokenizer to directory."""
+         if self._model and self._tokenizer:
+             path = Path(path)
+             path.mkdir(parents=True, exist_ok=True)
+             self._model.save_pretrained(str(path))
+             self._tokenizer.save_pretrained(str(path))
+             logger.info(f"BERT model saved to {path}")
+
+     def load_local(self, path: Path) -> bool:
+         """Load model from local directory."""
+         path = Path(path)
+         if not path.exists():
+             return False
+         try:
+             from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+             self._tokenizer = AutoTokenizer.from_pretrained(str(path))
+             self._model = AutoModelForSequenceClassification.from_pretrained(str(path))
+             self._pipeline = pipeline(
+                 "text-classification",
+                 model=self._model,
+                 tokenizer=self._tokenizer,
+                 truncation=True,
+                 max_length=512,
+                 device=-1,
+             )
+             self._loaded = True
+             logger.info(f"BERT model loaded from {path}")
+             return True
+         except Exception as e:
+             logger.error(f"BERT local load failed: {e}")
+             return False
+
+     @property
+     def is_loaded(self) -> bool:
+         return self._loaded
+
+
+ # ── Legacy compatibility ─────────────────────────────────────────────
+ _default_classifier = BERTPhishingClassifier()
+
+
+ def analyze_text(url: str, page_title: str = "", page_snippet: str = "") -> dict:
+     """Legacy wrapper for backward compatibility with main.py."""
+     prob = _default_classifier.predict(url, page_title, page_snippet)
+     return {
+         "bert_phishing_prob": prob,
+         "phrase_hits": 0,
+         "label": "BERT" if _default_classifier.is_loaded else "KEYWORD_NLP",
+         "confidence": prob,
+     }
+
+
+ def shannon_entropy(s: str) -> float:
+     """Utility: measure randomness of a string."""
+     if not s:
+         return 0.0
+     prob = [s.count(c) / len(s) for c in set(s)]
+     return -sum(p * math.log2(p) for p in prob if p > 0)
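
A quick usage sketch of the classifier (illustrative URL; output values will vary): load_model() is called lazily inside predict(), and the keyword fallback kicks in automatically when transformers is missing.

# Usage sketch; mirrors the public API defined above.
from bert_analyzer import BERTPhishingClassifier, analyze_text

clf = BERTPhishingClassifier()
print(clf.tokenize_url("https://paypa1-l0gin.xyz/verify"))
# -> "paypa1 l0gin xyz verify"   (homoglyphs like "paypa1" survive intact)

p = clf.predict(
    "https://paypa1-l0gin.xyz/verify",
    title="Verify your account",
    snippet="We noticed unusual activity. Confirm your identity now.",
)
print(f"P_bert = {p:.4f}")        # phishing probability in [0, 1]

# Legacy entry point used by main.py:
print(analyze_text("https://paypa1-l0gin.xyz/verify"))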
bert_finetune.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================
2
+ # PhishGuard AI - bert_finetune.py
3
+ # Full BERT fine-tuning script on PhishTank + TRANCO data
4
+ #
5
+ # Downloads data, fine-tunes ealvaradob/bert-finetuned-phishing
6
+ # 3 epochs, AdamW + linear warmup scheduler
7
+ # Saves to bert_weights/ with save_pretrained()
8
+ # Prints per-epoch: loss / precision / recall / F1
9
+ # ============================================================
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ import sys
15
+ from pathlib import Path
16
+ from typing import List, Tuple
17
+
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format="%(asctime)s | %(levelname)-7s | %(message)s",
21
+ )
22
+ logger = logging.getLogger("phishguard.bert_finetune")
23
+
24
+ BASE_DIR = Path(__file__).parent
25
+ BERT_WEIGHTS_DIR = BASE_DIR / "bert_weights"
26
+
27
+
28
+ def main() -> None:
29
+ """Fine-tune BERT on PhishTank + TRANCO URLs."""
30
+ print("=" * 60)
31
+ print("PhishGuard AI β€” BERT Fine-Tuning")
32
+ print("=" * 60)
33
+
34
+ # ── Check dependencies ───────────────────────────────────────
35
+ try:
36
+ import torch
37
+ from torch.utils.data import DataLoader, Dataset
38
+ from torch.optim import AdamW
39
+ from transformers import (
40
+ AutoTokenizer,
41
+ AutoModelForSequenceClassification,
42
+ get_linear_schedule_with_warmup,
43
+ )
44
+ from sklearn.metrics import precision_recall_fscore_support
45
+ except ImportError as e:
46
+ print(f"❌ Missing dependency: {e}")
47
+ print(" Run: pip install torch transformers scikit-learn")
48
+ sys.exit(1)
49
+
50
+ # ── Download data ────────────────────────────────────────────
51
+ from data_collector import download_phishtank, download_tranco, merge_datasets
52
+
53
+ print("\nπŸ“₯ Downloading datasets...")
54
+ phish_urls = download_phishtank(max_urls=50)
55
+ legit_urls = download_tranco(n=50)
56
+ print(f" Phishing URLs: {len(phish_urls)}")
57
+ print(f" Legitimate URLs: {len(legit_urls)}")
58
+
59
+ train_data, val_data, test_data = merge_datasets(phish_urls, legit_urls)
60
+
61
+ # ── URL tokenization ─────────────────────────────────────────
62
+ import re
63
+ _re_url_split = re.compile(r"[-./=?&_~%@:]+")
64
+
65
+ def tokenize_url(url: str) -> str:
66
+ text = url.replace("https://", "").replace("http://", "")
67
+ tokens = _re_url_split.split(text)
68
+ return " ".join(t for t in tokens if t)
69
+
70
+ # ── Dataset class ────────────────────────────────────────────
71
+ class PhishingURLDataset(Dataset):
72
+ def __init__(self, data: List[Tuple[str, int]], tokenizer, max_length: int = 512):
73
+ self.data = data
74
+ self.tokenizer = tokenizer
75
+ self.max_length = max_length
76
+
77
+ def __len__(self) -> int:
78
+ return len(self.data)
79
+
80
+ def __getitem__(self, idx: int):
81
+ url, label = self.data[idx]
82
+ text = f"URL: {tokenize_url(url)}"
83
+ encoding = self.tokenizer(
84
+ text,
85
+ truncation=True,
86
+ padding="max_length",
87
+ max_length=self.max_length,
88
+ return_tensors="pt",
89
+ )
90
+ return {
91
+ "input_ids": encoding["input_ids"].squeeze(0),
92
+ "attention_mask": encoding["attention_mask"].squeeze(0),
93
+ "labels": torch.tensor(label, dtype=torch.long),
94
+ }
95
+
96
+ # ── Load model ───────────────────────────────────────────────
97
+ MODEL_NAME = "ealvaradob/bert-finetuned-phishing"
98
+ FALLBACK = "mrm8488/bert-tiny-finetuned-sms-spam-detection"
99
+
100
+ print("\nπŸ€– Loading BERT model...")
101
+ tokenizer = None
102
+ model = None
103
+ for model_id in [MODEL_NAME, FALLBACK]:
104
+ try:
105
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
106
+ model = AutoModelForSequenceClassification.from_pretrained(
107
+ model_id, num_labels=2
108
+ )
109
+ print(f" βœ… Loaded: {model_id}")
110
+ break
111
+ except Exception as e:
112
+ print(f" ⚠️ {model_id} failed: {e}")
113
+ continue
114
+
115
+ if model is None or tokenizer is None:
116
+ print("❌ Could not load any BERT model. Exiting.")
117
+ sys.exit(1)
118
+
119
+ # ── Prepare data ─────────────────────────────────────────────
120
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
121
+ print(f" Device: {device}")
122
+
123
+ train_dataset = PhishingURLDataset(train_data, tokenizer)
124
+ val_dataset = PhishingURLDataset(val_data, tokenizer)
125
+
126
+ train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
127
+ val_loader = DataLoader(val_dataset, batch_size=32)
128
+
129
+ model = model.to(device)
130
+
131
+ # ── Optimizer + Scheduler ────────────────────────────────────
132
+ EPOCHS = 1
133
+ optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
134
+ total_steps = len(train_loader) * EPOCHS
135
+ scheduler = get_linear_schedule_with_warmup(
136
+ optimizer,
137
+ num_warmup_steps=total_steps // 10,
138
+ num_training_steps=total_steps,
139
+ )
140
+
141
+ # ── Training Loop ────────────────────────────────────────────
142
+ print(f"\nπŸ‹οΈ Training for {EPOCHS} epochs...")
143
+ print(f" Train batches: {len(train_loader)}")
144
+ print(f" Val batches: {len(val_loader)}")
145
+
146
+ best_f1 = 0.0
147
+ for epoch in range(1, EPOCHS + 1):
148
+ # Train
149
+ model.train()
150
+ total_loss = 0.0
151
+ train_preds = []
152
+ train_labels = []
153
+
154
+ for batch_idx, batch in enumerate(train_loader):
155
+ input_ids = batch["input_ids"].to(device)
156
+ attention_mask = batch["attention_mask"].to(device)
157
+ labels = batch["labels"].to(device)
158
+
159
+ optimizer.zero_grad()
160
+ outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
161
+ loss = outputs.loss
162
+ loss.backward()
163
+
164
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
165
+ optimizer.step()
166
+ scheduler.step()
167
+
168
+ total_loss += loss.item()
169
+ preds = torch.argmax(outputs.logits, dim=1)
170
+ train_preds.extend(preds.cpu().tolist())
171
+ train_labels.extend(labels.cpu().tolist())
172
+
173
+ if (batch_idx + 1) % 50 == 0:
174
+ print(f" Epoch {epoch} | Batch {batch_idx+1}/{len(train_loader)} | Loss: {loss.item():.4f}")
175
+
176
+ avg_loss = total_loss / len(train_loader)
177
+
178
+ # Validate
179
+ model.eval()
180
+ val_preds = []
181
+ val_labels = []
182
+ with torch.no_grad():
183
+ for batch in val_loader:
184
+ input_ids = batch["input_ids"].to(device)
185
+ attention_mask = batch["attention_mask"].to(device)
186
+ labels = batch["labels"].to(device)
187
+ outputs = model(input_ids=input_ids, attention_mask=attention_mask)
188
+ preds = torch.argmax(outputs.logits, dim=1)
189
+ val_preds.extend(preds.cpu().tolist())
190
+ val_labels.extend(labels.cpu().tolist())
191
+
192
+ precision, recall, f1, _ = precision_recall_fscore_support(
193
+ val_labels, val_preds, average="binary", zero_division=0
194
+ )
195
+
196
+ print(f"\n πŸ“Š Epoch {epoch}/{EPOCHS}:")
197
+ print(f" Loss: {avg_loss:.4f}")
198
+ print(f" Precision: {precision:.4f}")
199
+ print(f" Recall: {recall:.4f}")
200
+ print(f" F1 Score: {f1:.4f}")
201
+
202
+ # Save best model
203
+ if f1 > best_f1:
204
+ best_f1 = f1
205
+ BERT_WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
206
+ model.save_pretrained(str(BERT_WEIGHTS_DIR))
207
+ tokenizer.save_pretrained(str(BERT_WEIGHTS_DIR))
208
+ print(f" βœ… New best model saved to {BERT_WEIGHTS_DIR}")
209
+
210
+ print(f"\n🎯 Best F1: {best_f1:.4f}")
211
+ print(f"βœ… Fine-tuning complete. Weights saved to: {BERT_WEIGHTS_DIR}")
212
+ print("=" * 60)
213
+
214
+
215
+ if __name__ == "__main__":
216
+ main()
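
The script saves its best checkpoint with save_pretrained, so the standard from_pretrained API can reload it. A minimal reload sketch, assuming a local bert_weights/ directory stands in for BERT_WEIGHTS_DIR:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert_weights")                    # hypothetical path
model = AutoModelForSequenceClassification.from_pretrained("bert_weights")
model.eval()

# Same input format the training loop used: "URL: " + tokenize_url(url)
enc = tokenizer("URL: paypal verify xyz login", truncation=True,
                max_length=512, return_tensors="pt")
with torch.no_grad():
    probs = torch.softmax(model(**enc).logits, dim=-1)
print(f"P(phishing) = {probs[0, 1].item():.4f}")  # index 1 = phishing, matching the training labels
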
cnn_inference.py ADDED
@@ -0,0 +1,237 @@
1
+ # ============================================================
2
+ # PhishGuard AI - cnn/cnn_inference.py
3
+ # CNN inference wrapper for Tier 4 visual analysis.
4
+ # Supports: predict, hot-reload, incremental_update.
5
+ # ============================================================
6
+
7
+ from __future__ import annotations
8
+
9
+ import io
10
+ import random
11
+ import logging
12
+ from pathlib import Path
13
+ from typing import List, Optional, Tuple
14
+
15
+ import torch
16
+ from PIL import Image
17
+
18
+ logger = logging.getLogger("phishguard.cnn.inference")
19
+
20
+ CNN_DIR = Path(__file__).parent
21
+ BACKEND_DIR = CNN_DIR.parent
22
+ WEIGHTS_PATH = CNN_DIR / "cnn_weights.pt"
23
+ REPLAY_BUFFER_PATH = BACKEND_DIR / "data" / "cnn_replay_buffer.pt"
24
+
25
+
26
+ class CNNInference:
27
+ """CNN inference wrapper with hot-reload and incremental update."""
28
+
29
+ def __init__(self, weights_path: Optional[Path] = None) -> None:
30
+ self._weights_path = weights_path or WEIGHTS_PATH
31
+ self._model = None
32
+ self._loaded = False
33
+
34
+ def load(self, weights_path: Optional[Path] = None) -> bool:
35
+ """Load CNN model."""
36
+ from cnn_model import load_cnn
37
+
38
+ path = weights_path or self._weights_path
39
+ self._model = load_cnn(str(path) if path.exists() else None)
40
+ self._loaded = self._model is not None
41
+ return self._loaded
42
+
43
+ def predict(self, screenshot_bytes: bytes) -> float:
44
+ """
45
+ Predict phishing probability from screenshot bytes.
46
+ Returns P_cnn ∈ [0,1].
47
+ """
48
+ if not self._loaded:
49
+ self.load()
50
+
51
+ if self._model is None:
52
+ return 0.5
53
+
54
+ from cnn_model import preprocess_screenshot
55
+
56
+ try:
57
+ tensor = preprocess_screenshot(screenshot_bytes)
58
+ return self._model.predict_proba(tensor)
59
+ except Exception as e:
60
+ logger.error(f"CNN predict failed: {e}")
61
+ return 0.5
62
+
63
+ def reload(self, weights_path: Optional[Path] = None) -> bool:
64
+ """Hot-reload model with new weights."""
65
+ from cnn_model import load_cnn
66
+
67
+ path = weights_path or self._weights_path
68
+ new_model = load_cnn(str(path))
69
+ if new_model is not None:
70
+ self._model = new_model
71
+ self._loaded = True
72
+ logger.info(f"CNN hot-reloaded from {path}")
73
+ return True
74
+ return False
75
+
76
+ async def incremental_update(
77
+ self,
78
+ tier4_samples: List[Tuple[str, int]],
79
+ replay_buffer_path: Optional[Path] = None,
80
+ lr: float = 1e-4,
81
+ epochs: int = 3,
82
+ ) -> Optional[float]:
83
+ """
84
+ Incremental update on Tier 4 feedback samples.
85
+ Re-captures screenshots via Playwright, trains on them + replay buffer.
86
+ Returns accuracy_delta or None if no Tier 4 samples.
87
+ """
88
+ if not tier4_samples:
89
+ logger.info("No Tier 4 samples β€” skipping CNN update")
90
+ return None
91
+
92
+ if self._model is None:
93
+ logger.warning("CNN not loaded, cannot update")
94
+ return None
95
+
96
+ try:
97
+ import torch.nn as nn
98
+ from torch.optim import AdamW
99
+ from torch.utils.data import DataLoader, TensorDataset
100
+ import torchvision.transforms as T
101
+
102
+ device = torch.device("cpu")
103
+ model = self._model.to(device)
104
+
105
+ transform = T.Compose([
106
+ T.Resize((224, 224)),
107
+ T.ToTensor(),
108
+ T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
109
+ ])
110
+
111
+ # Try to capture screenshots for the new samples
112
+ tensors = []
113
+ labels = []
114
+
115
+ for url, label in tier4_samples:
116
+ try:
117
+ # Try to capture screenshot
118
+ screenshot_bytes = await self._capture_screenshot(url)
119
+ if screenshot_bytes:
120
+ img = Image.open(io.BytesIO(screenshot_bytes)).convert("RGB")
121
+ tensor = transform(img)
122
+ tensors.append(tensor)
123
+ labels.append(float(label))
124
+ except Exception as e:
125
+ logger.warning(f"Screenshot capture failed for {url}: {e}")
126
+ continue
127
+
128
+ # Load replay buffer (20% mix)
129
+ buf_path = replay_buffer_path or REPLAY_BUFFER_PATH
130
+ if buf_path.exists():
131
+ try:
132
+ buf_data = torch.load(buf_path, map_location="cpu", weights_only=False)
133
+ buf_paths = buf_data.get("paths", [])
134
+ buf_labels = buf_data.get("labels", [])
135
+
136
+ replay_count = max(1, len(buf_paths) // 5)
137
+ indices = random.sample(range(len(buf_paths)), min(replay_count, len(buf_paths)))
138
+
139
+ for idx in indices:
140
+ try:
141
+ img = Image.open(buf_paths[idx]).convert("RGB")
142
+ tensor = transform(img)
143
+ tensors.append(tensor)
144
+ labels.append(float(buf_labels[idx]))
145
+ except Exception:
146
+ continue
147
+ except Exception as e:
148
+ logger.warning(f"CNN replay buffer load failed: {e}")
149
+
150
+ if len(tensors) < 5:
151
+ logger.warning(f"Too few CNN samples ({len(tensors)}), skipping update")
152
+ return None
153
+
154
+ # Stack and create dataset
155
+ x_data = torch.stack(tensors)
156
+ y_data = torch.tensor(labels, dtype=torch.float)
157
+ dataset = TensorDataset(x_data, y_data)
158
+ loader = DataLoader(dataset, batch_size=8, shuffle=True)
159
+
160
+ # Pre-update accuracy
161
+ model.eval()
162
+ pre_correct = 0
163
+ with torch.no_grad():
164
+ for bx, by in loader:
165
+ bx, by = bx.to(device), by.to(device)
166
+ out = model(bx).view(-1)  # view(-1), not squeeze(): squeeze() yields a 0-dim tensor when a batch has one sample
167
+ preds = (out >= 0.5).float()
168
+ pre_correct += (preds == by).sum().item()
169
+ pre_acc = pre_correct / len(dataset)
170
+
171
+ # Train (head only β€” backbone stays frozen)
172
+ head_params = [p for p in model.backbone.fc.parameters() if p.requires_grad]
173
+ optimizer = AdamW(head_params, lr=lr)
174
+ loss_fn = nn.BCELoss()
175
+
176
+ model.train()
177
+ for epoch in range(epochs):
178
+ total_loss = 0.0
179
+ for bx, by in loader:
180
+ bx, by = bx.to(device), by.to(device)
181
+ optimizer.zero_grad()
182
+ out = model(bx).view(-1)  # keep shape (B,) so it matches by for BCELoss
183
+ loss = loss_fn(out, by)
184
+ loss.backward()
185
+ optimizer.step()
186
+ total_loss += loss.item()
187
+ logger.info(f"CNN incremental epoch {epoch+1}/{epochs}, loss={total_loss/len(loader):.4f}")
188
+
189
+ # Post-update accuracy
190
+ model.eval()
191
+ post_correct = 0
192
+ with torch.no_grad():
193
+ for bx, by in loader:
194
+ bx, by = bx.to(device), by.to(device)
195
+ out = model(bx).view(-1)
196
+ preds = (out >= 0.5).float()
197
+ post_correct += (preds == by).sum().item()
198
+ post_acc = post_correct / len(dataset)
199
+
200
+ delta = post_acc - pre_acc
201
+ self._model = model
202
+
203
+ # Save weights
204
+ torch.save(model.state_dict(), self._weights_path)
205
+ logger.info(f"CNN incremental: {pre_acc:.4f} β†’ {post_acc:.4f} (Ξ”={delta:+.4f})")
206
+
207
+ return round(delta, 4)
208
+
209
+ except Exception as e:
210
+ logger.error(f"CNN incremental update failed: {e}")
211
+ return None
212
+
213
+ async def _capture_screenshot(self, url: str) -> Optional[bytes]:
214
+ """Capture a screenshot of a URL using Playwright."""
215
+ try:
216
+ from playwright.async_api import async_playwright
217
+
218
+ async with async_playwright() as p:
219
+ browser = await p.chromium.launch(headless=True)
220
+ page = await browser.new_page(viewport={"width": 1280, "height": 800})
221
+
222
+ # Block heavy resources
223
+ await page.route("**/*.{png,jpg,jpeg,gif,svg,mp4,webm,ogg,woff,woff2,ttf,eot}",
224
+ lambda route: route.abort())
225
+
226
+ await page.goto(url, wait_until="domcontentloaded", timeout=10000)
227
+ screenshot = await page.screenshot(type="png")
228
+ await browser.close()
229
+ return screenshot
230
+
231
+ except Exception as e:
232
+ logger.warning(f"Screenshot capture failed: {e}")
233
+ return None
234
+
235
+ @property
236
+ def is_loaded(self) -> bool:
237
+ return self._loaded
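
A usage sketch for the wrapper above; the screenshot file is a hypothetical stand-in for bytes captured by Playwright:

from pathlib import Path
from cnn_inference import CNNInference

cnn = CNNInference()
png_bytes = Path("page.png").read_bytes()   # hypothetical screenshot file
p_cnn = cnn.predict(png_bytes)              # falls back to 0.5 if no model can be loaded
print(f"P_cnn = {p_cnn:.3f}")
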
cnn_model.py ADDED
@@ -0,0 +1,121 @@
1
+ # ============================================================
2
+ # PhishGuard AI - cnn/cnn_model.py
3
+ # ResNet50 visual classifier for phishing screenshot detection.
4
+ #
5
+ # Architecture (from spec):
6
+ # Backbone: ResNet50 fully frozen
7
+ # Custom head: Linear(2048β†’512) β†’ ReLU β†’ Dropout(0.5) β†’
8
+ # Linear(512β†’1) β†’ Sigmoid
9
+ # Input: 224Γ—224 screenshot tensor
10
+ # Output: P_cnn ∈ [0,1]
11
+ # ============================================================
12
+
13
+ from __future__ import annotations
14
+
15
+ import io
16
+ import logging
17
+ from typing import Optional
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import torchvision.models as models
22
+ import torchvision.transforms as T
23
+ from PIL import Image
24
+
25
+ logger = logging.getLogger("phishguard.cnn.model")
26
+
27
+
28
+ class PhishCNN(nn.Module):
29
+ """
30
+ ResNet50 with frozen backbone and custom 2-layer binary classification head.
31
+ Output: P_cnn ∈ [0,1] via sigmoid.
32
+ """
33
+
34
+ def __init__(self, pretrained: bool = True) -> None:
35
+ super().__init__()
36
+
37
+ # Load pretrained ResNet50 backbone
38
+ if pretrained:
39
+ self.backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
40
+ else:
41
+ self.backbone = models.resnet50(weights=None)
42
+
43
+ # Freeze entire backbone
44
+ for param in self.backbone.parameters():
45
+ param.requires_grad = False
46
+
47
+ # Replace fc with custom head: 2048 β†’ 512 β†’ 1 β†’ sigmoid
48
+ in_features = self.backbone.fc.in_features # 2048
49
+ self.backbone.fc = nn.Sequential(
50
+ nn.Linear(in_features, 512),
51
+ nn.ReLU(),
52
+ nn.Dropout(0.5),
53
+ nn.Linear(512, 1),
54
+ )
55
+
56
+ # Ensure custom head is trainable
57
+ for param in self.backbone.fc.parameters():
58
+ param.requires_grad = True
59
+
60
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
61
+ """
62
+ Forward pass.
63
+ Input: (batch, 3, 224, 224)
64
+ Output: (batch, 1) probabilities in [0, 1]
65
+ """
66
+ logits = self.backbone(x)
67
+ return torch.sigmoid(logits)
68
+
69
+ def predict_proba(self, x: torch.Tensor) -> float:
70
+ """Return P_cnn ∈ [0,1] β€” probability of phishing."""
71
+ self.eval()
72
+ with torch.no_grad():
73
+ output = self.forward(x)
74
+ return output.squeeze().item()
75
+
76
+
77
+ # ── Preprocessing pipeline (matches ImageNet normalization) ──────────
78
+ TRANSFORM = T.Compose([
79
+ T.Resize((224, 224)),
80
+ T.ToTensor(),
81
+ T.Normalize(
82
+ mean=[0.485, 0.456, 0.406], # ImageNet mean
83
+ std=[0.229, 0.224, 0.225], # ImageNet std
84
+ ),
85
+ ])
86
+
87
+ # Training augmentation transforms
88
+ TRAIN_TRANSFORM = T.Compose([
89
+ T.Resize((224, 224)),
90
+ T.RandomHorizontalFlip(),
91
+ T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
92
+ T.RandomRotation(5),
93
+ T.ToTensor(),
94
+ T.Normalize(
95
+ mean=[0.485, 0.456, 0.406],
96
+ std=[0.229, 0.224, 0.225],
97
+ ),
98
+ ])
99
+
100
+
101
+ def preprocess_screenshot(screenshot_bytes: bytes) -> torch.Tensor:
102
+ """Convert raw screenshot bytes β†’ model-ready tensor [1, 3, 224, 224]."""
103
+ img = Image.open(io.BytesIO(screenshot_bytes)).convert("RGB")
104
+ return TRANSFORM(img).unsqueeze(0)
105
+
106
+
107
+ def load_cnn(weights_path: Optional[str] = None) -> PhishCNN:
108
+ """Load CNN model with optional trained weights."""
109
+ model = PhishCNN(pretrained=True)
110
+
111
+ if weights_path:
112
+ try:
113
+ state = torch.load(weights_path, map_location="cpu", weights_only=True)
114
+ model.load_state_dict(state)
115
+ logger.info(f"CNN weights loaded from {weights_path}")
116
+ except Exception as e:
117
+ logger.warning(f"Could not load CNN weights: {e}")
118
+ logger.info("Using ImageNet features only (baseline)")
119
+
120
+ model.eval()
121
+ return model
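
A minimal end-to-end sketch of this module; if the weights file is absent, load_cnn keeps the ImageNet backbone with an untrained head:

from pathlib import Path
from cnn_model import load_cnn, preprocess_screenshot

model = load_cnn("cnn_weights.pt")            # logs a warning and continues if the file is missing
png_bytes = Path("page.png").read_bytes()     # hypothetical screenshot file
tensor = preprocess_screenshot(png_bytes)     # shape [1, 3, 224, 224]
print(f"P_cnn = {model.predict_proba(tensor):.3f}")
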
config.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "_name_or_path": "ealvaradob/bert-finetuned-phishing",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "id2label": {
13
+ "0": "benign",
14
+ "1": "phishing"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 4096,
18
+ "label2id": {
19
+ "benign": 0,
20
+ "phishing": 1
21
+ },
22
+ "layer_norm_eps": 1e-12,
23
+ "max_position_embeddings": 512,
24
+ "model_type": "bert",
25
+ "num_attention_heads": 16,
26
+ "num_hidden_layers": 24,
27
+ "pad_token_id": 0,
28
+ "position_embedding_type": "absolute",
29
+ "problem_type": "single_label_classification",
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.40.0",
32
+ "type_vocab_size": 2,
33
+ "use_cache": true,
34
+ "vocab_size": 30522
35
+ }
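
This config pins the label mapping the rest of the pipeline relies on (0 = benign, 1 = phishing). A quick sketch of reading it with transformers, assuming the file sits in the model directory being loaded:

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("bert_weights")   # hypothetical directory containing this config.json
print(cfg.id2label)                                # {0: 'benign', 1: 'phishing'}
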
content.js ADDED
@@ -0,0 +1,180 @@
1
+ // ============================================================
2
+ // PhishGuard AI - content.js
3
+ // Content script: runs inside every page.
4
+ // Detects phishing signals and injects feedback banner.
5
+ // ============================================================
6
+
7
+ (function() {
8
+ "use strict";
9
+
10
+ // ── Page Signal Detection ──────────────────────────────────────
11
+ function detectPageSignals() {
12
+ const signals = [];
13
+ const title = document.title || "";
14
+ const bodyText = (document.body?.innerText || "").substring(0, 2000).toLowerCase();
15
+ const url = window.location.href.toLowerCase();
16
+
17
+ // 1. Password form posting to external domain
18
+ const forms = document.querySelectorAll("form");
19
+ forms.forEach(form => {
20
+ const hasPassword = form.querySelector('input[type="password"]');
21
+ const action = (form.getAttribute("action") || "").toLowerCase();
22
+ if (hasPassword && action.startsWith("http") && !action.includes(window.location.hostname)) {
23
+ signals.push("password_form_external_action");
24
+ }
25
+ });
26
+
27
+ // 2. Brand name in title mismatching hostname
28
+ const brands = ["paypal","google","apple","microsoft","amazon","netflix",
29
+ "facebook","instagram","chase","wellsfargo","bankofamerica"];
30
+ const hostname = window.location.hostname.toLowerCase();
31
+ for (const brand of brands) {
32
+ if (title.toLowerCase().includes(brand) && !hostname.includes(brand)) {
33
+ signals.push(`brand_mismatch:${brand}`);
34
+ }
35
+ }
36
+
37
+ // 3. Urgency language
38
+ const urgencyPhrases = [
39
+ "your account has been", "verify immediately", "suspended",
40
+ "unusual activity", "click here now", "act now",
41
+ "confirm your identity", "limited time", "expires soon"
42
+ ];
43
+ for (const phrase of urgencyPhrases) {
44
+ if (bodyText.includes(phrase)) {
45
+ signals.push("urgency_language");
46
+ break;
47
+ }
48
+ }
49
+
50
+ // 4. Hidden iframes
51
+ const iframes = document.querySelectorAll("iframe");
52
+ iframes.forEach(iframe => {
53
+ const style = window.getComputedStyle(iframe);
54
+ const w = parseInt(style.width) || iframe.width;
55
+ const h = parseInt(style.height) || iframe.height;
56
+ if (style.display === "none" || style.visibility === "hidden" ||
57
+ (w <= 1 && h <= 1)) {
58
+ signals.push("hidden_iframe");
59
+ }
60
+ });
61
+
62
+ return {
63
+ title,
64
+ snippet: bodyText.substring(0, 500),
65
+ signals,
66
+ };
67
+ }
68
+
69
+ // Send signals to background.js
70
+ try {
71
+ const pageData = detectPageSignals();
72
+ chrome.runtime.sendMessage({
73
+ type: "page_signals",
74
+ url: window.location.href,
75
+ ...pageData,
76
+ });
77
+ } catch (e) {
78
+ // Extension context may be invalidated
79
+ }
80
+
81
+ // ── Feedback Banner Injection ──────────────────────────────────
82
+ // Listen for messages from background.js to inject banner
83
+ chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => {
84
+ if (msg.type === "inject_feedback_banner") {
85
+ injectFeedbackBanner(msg.verdict, msg.confidence, msg.urlHash, msg.tier);
86
+ sendResponse({ success: true });
87
+ }
88
+ });
89
+
90
+ function injectFeedbackBanner(verdict, confidence, urlHash, tier) {
91
+ // Don't inject if already present
92
+ if (document.getElementById("phishguard-feedback-banner")) return;
93
+
94
+ const isPhishing = verdict === "phishing";
95
+ const confPct = Math.round(confidence * 100);
96
+ const tierText = `Tier ${tier}`;
97
+
98
+ const banner = document.createElement("div");
99
+ banner.id = "phishguard-feedback-banner";
100
+ banner.style.cssText = `
101
+ position: fixed; top: 0; left: 0; right: 0; z-index: 2147483647;
102
+ background: ${isPhishing ? "linear-gradient(135deg, #1a0000, #3a0000)" : "linear-gradient(135deg, #001a00, #003a00)"};
103
+ color: white; padding: 10px 20px;
104
+ display: flex; align-items: center; gap: 12px;
105
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
106
+ font-size: 14px; box-shadow: 0 4px 20px rgba(0,0,0,0.5);
107
+ border-bottom: 2px solid ${isPhishing ? "#ef4444" : "#22c55e"};
108
+ `;
109
+
110
+ const icon = isPhishing ? "πŸ›‘οΈ" : "βœ…";
111
+ const statusText = isPhishing ? "PhishGuard flagged this page" : "PhishGuard: Page looks safe";
112
+
113
+ banner.innerHTML = `
114
+ <span style="font-size: 18px">${icon}</span>
115
+ <span style="flex: 1">${statusText} Β· ${confPct}% Β· ${tierText}</span>
116
+ <button id="pg-correct" style="
117
+ background: rgba(34,197,94,0.2); border: 1px solid #22c55e; color: #22c55e;
118
+ padding: 6px 14px; border-radius: 6px; cursor: pointer; font-size: 13px;
119
+ font-weight: 600; transition: all 0.2s;
120
+ ">πŸ‘ Correct</button>
121
+ <button id="pg-wrong" style="
122
+ background: rgba(239,68,68,0.2); border: 1px solid #ef4444; color: #ef4444;
123
+ padding: 6px 14px; border-radius: 6px; cursor: pointer; font-size: 13px;
124
+ font-weight: 600; transition: all 0.2s;
125
+ ">πŸ‘Ž Wrong</button>
126
+ ${isPhishing ? `<button id="pg-proceed" style="
127
+ background: transparent; border: 1px solid rgba(255,255,255,0.2); color: #999;
128
+ padding: 6px 14px; border-radius: 6px; cursor: pointer; font-size: 12px;
129
+ transition: all 0.2s;
130
+ ">Proceed Anyway</button>` : ""}
131
+ <button id="pg-close" style="
132
+ background: none; border: none; color: #666; cursor: pointer;
133
+ font-size: 18px; padding: 0 4px;
134
+ ">Γ—</button>
135
+ `;
136
+
137
+ document.body.prepend(banner);
138
+ document.body.style.marginTop = (banner.offsetHeight) + "px";
139
+
140
+ // Button handlers
141
+ document.getElementById("pg-correct")?.addEventListener("click", () => {
142
+ submitBannerFeedback(urlHash, "correct", banner);
143
+ });
144
+
145
+ document.getElementById("pg-wrong")?.addEventListener("click", () => {
146
+ submitBannerFeedback(urlHash, "incorrect", banner);
147
+ });
148
+
149
+ document.getElementById("pg-proceed")?.addEventListener("click", () => {
150
+ chrome.runtime.sendMessage({ type: "whitelist_url", url: window.location.href });
151
+ removeBanner(banner);
152
+ });
153
+
154
+ document.getElementById("pg-close")?.addEventListener("click", () => {
155
+ removeBanner(banner);
156
+ });
157
+ }
158
+
159
+ function submitBannerFeedback(urlHash, feedback, banner) {
160
+ chrome.runtime.sendMessage({
161
+ type: "submit_feedback",
162
+ url_hash: urlHash,
163
+ feedback: feedback,
164
+ }, (response) => {
165
+ if (response?.success) {
166
+ banner.innerHTML = `
167
+ <span style="font-size: 18px">βœ…</span>
168
+ <span style="flex: 1; color: #22c55e">Thanks! Your feedback helps improve PhishGuard</span>
169
+ `;
170
+ setTimeout(() => removeBanner(banner), 3000);
171
+ }
172
+ });
173
+ }
174
+
175
+ function removeBanner(banner) {
176
+ document.body.style.marginTop = "";
177
+ banner.remove();
178
+ }
179
+
180
+ })();
data_collector.py ADDED
@@ -0,0 +1,364 @@
1
+ # ============================================================
2
+ # PhishGuard AI - data_collector.py
3
+ # Downloads all training data from public HTTP endpoints.
4
+ # No API keys required.
5
+ #
6
+ # Datasets:
7
+ # 1. PhishTank (bz2 JSON β†’ phishing URLs)
8
+ # 2. TRANCO Top-10K (zip CSV β†’ legitimate domains)
9
+ # 3. Kaggle GitHub mirror (CSV β†’ pre-extracted features)
10
+ # ============================================================
11
+
12
+ from __future__ import annotations
13
+
14
+ import bz2
15
+ import csv
16
+ import io
17
+ import json
18
+ import zipfile
19
+ import hashlib
20
+ import logging
21
+ from pathlib import Path
22
+ from typing import List, Tuple, Optional
23
+
24
+ import requests
25
+ import pandas as pd
26
+ from sklearn.model_selection import train_test_split
27
+
28
+ logger = logging.getLogger("phishguard.data_collector")
29
+
30
+ # ── Data directory ────────────────────────────────────────────────────
31
+ DATA_DIR = Path(__file__).parent / "data"
32
+ DATA_DIR.mkdir(parents=True, exist_ok=True)
33
+
34
+ # ── Public URLs (no API keys) ────────────────────────────────────────
35
+ PHISHTANK_URL = "http://data.phishtank.com/data/online-valid.json.bz2"
36
+ TRANCO_URL = "https://tranco-list.eu/top-1m.csv.zip"
37
+ KAGGLE_PRIMARY = "https://raw.githubusercontent.com/GregaVrbancic/Phishing-Dataset/master/dataset_full.csv"
38
+ KAGGLE_BACKUP = "https://raw.githubusercontent.com/datasets/phishing-websites/master/data.csv"
39
+
40
+ HEADERS = {
41
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
42
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
43
+ "Chrome/120.0.0.0 Safari/537.36"
44
+ }
45
+
46
+
47
+ def download_phishtank(max_urls: int = 30000) -> List[str]:
48
+ """
49
+ Download phishing URLs from PhishTank public feed.
50
+ Fetches bz2 β†’ decompresses β†’ parses JSON β†’ filters verified+online.
51
+
52
+ Returns list of verified phishing URLs (up to max_urls).
53
+ """
54
+ logger.info("Downloading PhishTank data...")
55
+ phish_cache = DATA_DIR / "phishing_urls.txt"
56
+
57
+ # Use cache if recent
58
+ if phish_cache.exists() and phish_cache.stat().st_size > 1000:
59
+ urls = phish_cache.read_text().strip().splitlines()
60
+ if len(urls) >= 100:
61
+ logger.info(f"Using cached PhishTank data: {len(urls)} URLs")
62
+ return urls[:max_urls]
63
+
64
+ try:
65
+ resp = requests.get(PHISHTANK_URL, headers=HEADERS, timeout=120, stream=True)
66
+ resp.raise_for_status()
67
+
68
+ # Decompress bz2
69
+ raw_data = bz2.decompress(resp.content)
70
+ records = json.loads(raw_data)
71
+
72
+ # Filter: verified=True AND online (verification_time present)
73
+ urls: List[str] = []
74
+ for record in records:
75
+ if not isinstance(record, dict):
76
+ continue
77
+ url = record.get("url", "").strip()
78
+ verified = record.get("verified", "no")
79
+ online = record.get("online", "no")
80
+
81
+ is_verified = verified in (True, "yes", "true", "True", "1", 1)
82
+ is_online = online in (True, "yes", "true", "True", "1", 1)
83
+
84
+ if url and is_verified and is_online:
85
+ urls.append(url)
86
+ if len(urls) >= max_urls:
87
+ break
88
+
89
+ logger.info(f"PhishTank: {len(urls)} verified+online URLs extracted")
90
+
91
+ # Cache to disk
92
+ phish_cache.write_text("\n".join(urls))
93
+ return urls
94
+
95
+ except Exception as e:
96
+ logger.warning(f"PhishTank download failed: {e}")
97
+ # Fallback: try to use cached data
98
+ if phish_cache.exists():
99
+ urls = phish_cache.read_text().strip().splitlines()
100
+ logger.info(f"Using fallback cached data: {len(urls)} URLs")
101
+ return urls[:max_urls]
102
+
103
+ # Generate synthetic phishing-like URLs for training
104
+ logger.warning("Generating synthetic phishing URLs as fallback")
105
+ return _generate_synthetic_phishing(500)
106
+
107
+
108
+ def _generate_synthetic_phishing(count: int) -> List[str]:
109
+ """Generate synthetic phishing URLs for training when real data unavailable."""
110
+ import random
111
+ brands = ["paypal", "google", "apple", "microsoft", "amazon", "netflix",
112
+ "facebook", "chase", "wellsfargo", "bankofamerica"]
113
+ tlds = [".xyz", ".tk", ".ml", ".ga", ".cf", ".gq", ".pw", ".top", ".click"]
114
+ keywords = ["login", "verify", "secure", "update", "account", "signin",
115
+ "reset", "confirm", "suspend", "banking", "alert", "password"]
116
+ urls: List[str] = []
117
+ for _ in range(count):
118
+ brand = random.choice(brands)
119
+ tld = random.choice(tlds)
120
+ kw = random.choice(keywords)
121
+ sep = random.choice(["-", ".", ""])
122
+ prefix = random.choice(["http://", "https://"])
123
+ sub = random.choice(["", "www.", "secure.", "login.", "m."])
124
+ urls.append(f"{prefix}{sub}{brand}{sep}{kw}{tld}/{kw}/index.html")
125
+ return urls
126
+
127
+
128
+ def download_tranco(n: int = 10000) -> List[str]:
129
+ """
130
+ Download TRANCO Top-1M list, return top-N domains as https:// URLs.
131
+
132
+ Fetches zip β†’ extracts CSV β†’ takes column 2 (domain) β†’ top N rows.
133
+ """
134
+ logger.info(f"Downloading TRANCO top-{n} domains...")
135
+ legit_cache = DATA_DIR / "legitimate_urls.txt"
136
+
137
+ # Use cache if present
138
+ if legit_cache.exists() and legit_cache.stat().st_size > 1000:
139
+ urls = legit_cache.read_text().strip().splitlines()
140
+ if len(urls) >= min(n, 100):
141
+ logger.info(f"Using cached TRANCO data: {len(urls)} domains")
142
+ return urls[:n]
143
+
144
+ try:
145
+ resp = requests.get(TRANCO_URL, headers=HEADERS, timeout=60)
146
+ resp.raise_for_status()
147
+
148
+ # Extract CSV from zip
149
+ with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
150
+ csv_name = zf.namelist()[0]
151
+ csv_data = zf.read(csv_name).decode("utf-8")
152
+
153
+ # Parse: format is "rank,domain" per line
154
+ urls: List[str] = []
155
+ for line in csv_data.strip().splitlines():
156
+ parts = line.split(",")
157
+ if len(parts) >= 2:
158
+ domain = parts[1].strip()
159
+ if domain:
160
+ urls.append(f"https://{domain}")
161
+ if len(urls) >= n:
162
+ break
163
+
164
+ logger.info(f"TRANCO: {len(urls)} legitimate domains extracted")
165
+
166
+ # Cache to disk
167
+ legit_cache.write_text("\n".join(urls))
168
+ return urls
169
+
170
+ except Exception as e:
171
+ logger.warning(f"TRANCO download failed: {e}")
172
+ # Fallback: use cached data or generate synthetic
173
+ if legit_cache.exists():
174
+ urls = legit_cache.read_text().strip().splitlines()
175
+ return urls[:n]
176
+
177
+ logger.warning("Generating synthetic legitimate URLs as fallback")
178
+ return _generate_synthetic_legitimate(n)
179
+
180
+
181
+ def _generate_synthetic_legitimate(count: int) -> List[str]:
182
+ """Generate legitimate-looking URLs as fallback."""
183
+ top_domains = [
184
+ "google.com", "youtube.com", "facebook.com", "amazon.com",
185
+ "wikipedia.org", "twitter.com", "instagram.com", "linkedin.com",
186
+ "microsoft.com", "apple.com", "github.com", "stackoverflow.com",
187
+ "reddit.com", "netflix.com", "paypal.com", "yahoo.com", "bing.com",
188
+ "adobe.com", "dropbox.com", "zoom.us", "slack.com", "spotify.com",
189
+ "twitch.tv", "ebay.com", "walmart.com", "target.com", "cnn.com",
190
+ "bbc.com", "nytimes.com", "medium.com",
191
+ ]
192
+ urls = [f"https://{d}" for d in top_domains]
193
+ # Pad with numbered subpages
194
+ while len(urls) < count:
195
+ d = top_domains[len(urls) % len(top_domains)]
196
+ urls.append(f"https://{d}/page/{len(urls)}")
197
+ return urls[:count]
198
+
199
+
200
+ def download_kaggle_mirror() -> pd.DataFrame:
201
+ """
202
+ Download pre-extracted URL features from Kaggle GitHub mirror.
203
+ Falls back to backup URL if primary fails.
204
+
205
+ Returns DataFrame with features and CLASS_LABEL column.
206
+ """
207
+ logger.info("Downloading Kaggle URL features dataset...")
208
+ kaggle_cache = DATA_DIR / "kaggle_features.csv"
209
+
210
+ if kaggle_cache.exists() and kaggle_cache.stat().st_size > 1000:
211
+ logger.info("Using cached Kaggle features")
212
+ return pd.read_csv(kaggle_cache)
213
+
214
+ for url in [KAGGLE_PRIMARY, KAGGLE_BACKUP]:
215
+ try:
216
+ resp = requests.get(url, headers=HEADERS, timeout=60)
217
+ resp.raise_for_status()
218
+ df = pd.read_csv(io.StringIO(resp.text))
219
+
220
+ # Standardize label column name
221
+ label_candidates = ["CLASS_LABEL", "class_label", "Result", "result", "label"]
222
+ for col in label_candidates:
223
+ if col in df.columns:
224
+ df = df.rename(columns={col: "CLASS_LABEL"})
225
+ break
226
+
227
+ if "CLASS_LABEL" not in df.columns:
228
+ # Try last column
229
+ df = df.rename(columns={df.columns[-1]: "CLASS_LABEL"})
230
+
231
+ # Normalize labels to 0/1
232
+ if df["CLASS_LABEL"].dtype == object:
233
+ df["CLASS_LABEL"] = df["CLASS_LABEL"].map(
234
+ {"legitimate": 0, "phishing": 1, "safe": 0}
235
+ ).fillna(0).astype(int)
236
+ else:
237
+ # Handle -1 as legitimate (common in some datasets)
238
+ df["CLASS_LABEL"] = df["CLASS_LABEL"].apply(
239
+ lambda x: 0 if x <= 0 else 1
240
+ )
241
+
242
+ # Cache
243
+ df.to_csv(kaggle_cache, index=False)
244
+ logger.info(f"Kaggle features: {len(df)} rows, {len(df.columns)} columns")
245
+ return df
246
+
247
+ except Exception as e:
248
+ logger.warning(f"Kaggle mirror {url} failed: {e}")
249
+ continue
250
+
251
+ logger.error("All Kaggle mirrors failed")
252
+ return pd.DataFrame()
253
+
254
+
255
+ def merge_datasets(
256
+ phish_urls: List[str],
257
+ legit_urls: List[str],
258
+ test_size: float = 0.15,
259
+ val_size: float = 0.15,
260
+ ) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]], List[Tuple[str, int]]]:
261
+ """
262
+ Merge phishing + legitimate URLs, return stratified 70/15/15 split.
263
+
264
+ Returns (train, val, test) where each is List[(url, label)].
265
+ Label: 1 = phishing, 0 = legitimate.
266
+ """
267
+ # Deduplicate
268
+ phish_set = set(phish_urls)
269
+ legit_set = set(legit_urls) - phish_set # Ensure no URL in both sets
270
+
271
+ all_data = [(url, 1) for url in phish_set] + [(url, 0) for url in legit_set]
272
+ urls = [d[0] for d in all_data]
273
+ labels = [d[1] for d in all_data]
274
+
275
+ # First split: train+val vs test
276
+ train_val_urls, test_urls, train_val_labels, test_labels = train_test_split(
277
+ urls, labels,
278
+ test_size=test_size,
279
+ stratify=labels,
280
+ random_state=42,
281
+ )
282
+
283
+ # Second split: train vs val
284
+ relative_val = val_size / (1 - test_size)
285
+ train_urls, val_urls, train_labels, val_labels = train_test_split(
286
+ train_val_urls, train_val_labels,
287
+ test_size=relative_val,
288
+ stratify=train_val_labels,
289
+ random_state=42,
290
+ )
291
+
292
+ train = list(zip(train_urls, train_labels))
293
+ val = list(zip(val_urls, val_labels))
294
+ test = list(zip(test_urls, test_labels))
295
+
296
+ logger.info(f"Dataset split: train={len(train)}, val={len(val)}, test={len(test)}")
297
+ return train, val, test
298
+
299
+
300
+ def save_url_lists(
301
+ phish_urls: List[str],
302
+ legit_urls: List[str],
303
+ phish_path: Optional[Path] = None,
304
+ legit_path: Optional[Path] = None,
305
+ ) -> None:
306
+ """Save URL lists to text files."""
307
+ phish_path = phish_path or DATA_DIR / "phishing_urls.txt"
308
+ legit_path = legit_path or DATA_DIR / "legitimate_urls.txt"
309
+
310
+ phish_path.write_text("\n".join(phish_urls))
311
+ legit_path.write_text("\n".join(legit_urls))
312
+ logger.info(f"Saved {len(phish_urls)} phishing URLs to {phish_path}")
313
+ logger.info(f"Saved {len(legit_urls)} legitimate URLs to {legit_path}")
314
+
315
+
316
+ def url_hash(url: str) -> str:
317
+ """SHA256 hash of a URL (for dedup and privacy)."""
318
+ return hashlib.sha256(url.encode("utf-8")).hexdigest()
319
+
320
+
321
+ # ── Entry point ──────────────────────────────────────────────────────
322
+ def main() -> None:
323
+ logging.basicConfig(
324
+ level=logging.INFO,
325
+ format="%(asctime)s | %(levelname)-7s | %(message)s",
326
+ )
327
+
328
+ print("=" * 60)
329
+ print("PhishGuard AI β€” Data Collection")
330
+ print("=" * 60)
331
+
332
+ # 1. PhishTank
333
+ phish_urls = download_phishtank()
334
+ print(f"\nβœ… PhishTank: {len(phish_urls)} phishing URLs")
335
+
336
+ # 2. TRANCO
337
+ legit_urls = download_tranco(n=10000)
338
+ print(f"βœ… TRANCO: {len(legit_urls)} legitimate URLs")
339
+
340
+ # 3. Kaggle features
341
+ kaggle_df = download_kaggle_mirror()
342
+ if not kaggle_df.empty:
343
+ phish_count = (kaggle_df["CLASS_LABEL"] == 1).sum()
344
+ legit_count = (kaggle_df["CLASS_LABEL"] == 0).sum()
345
+ print(f"βœ… Kaggle: {len(kaggle_df)} rows ({phish_count} phishing, {legit_count} legit)")
346
+ else:
347
+ print("⚠️ Kaggle: download failed (will use PhishTank + TRANCO only)")
348
+
349
+ # 4. Save URL lists
350
+ save_url_lists(phish_urls, legit_urls)
351
+
352
+ # 5. Merge and split
353
+ train, val, test = merge_datasets(phish_urls, legit_urls)
354
+ print(f"\nπŸ“Š Dataset splits:")
355
+ print(f" Train: {len(train)} ({sum(1 for _,l in train if l==1)} phish / {sum(1 for _,l in train if l==0)} legit)")
356
+ print(f" Val: {len(val)} ({sum(1 for _,l in val if l==1)} phish / {sum(1 for _,l in val if l==0)} legit)")
357
+ print(f" Test: {len(test)} ({sum(1 for _,l in test if l==1)} phish / {sum(1 for _,l in test if l==0)} legit)")
358
+
359
+ print(f"\nβœ… All data saved to {DATA_DIR}")
360
+ print("=" * 60)
361
+
362
+
363
+ if __name__ == "__main__":
364
+ main()
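
The two-stage split in merge_datasets works because the second ratio is rescaled: val_size / (1 - test_size) = 0.15 / 0.85 β‰ˆ 17.6% of the remaining 85%, which is 15% of the original data, giving 70/15/15 overall. A small sketch with synthetic URL lists:

from data_collector import merge_datasets

phish = [f"http://phish-{i}.xyz/login" for i in range(100)]
legit = [f"https://site-{i}.com" for i in range(100)]
train, val, test = merge_datasets(phish, legit)
print(len(train), len(val), len(test))   # 140 30 30 -> 70/15/15
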
domain_graph_builder.py ADDED
@@ -0,0 +1,303 @@
1
+ # ============================================================
2
+ # PhishGuard AI - gnn/domain_graph_builder.py
3
+ # Builds graph representations for GNN inference + training.
4
+ #
5
+ # Node features (12-dim per URL):
6
+ # [url_len_norm, domain_len_norm, subdomain_count_norm,
7
+ # shannon_entropy_norm, digit_ratio, hyphen_count_norm,
8
+ # phishing_keyword_hits_norm, suspicious_tld_binary,
9
+ # ip_as_hostname_binary, has_https_binary,
10
+ # path_depth_norm, query_string_len_norm]
11
+ #
12
+ # Edges: shared suspicious TLD + shared IP (async DNS)
13
+ # ============================================================
14
+
15
+ from __future__ import annotations
16
+
17
+ import re
18
+ import math
19
+ import asyncio
20
+ import logging
21
+ import socket
22
+ from typing import Dict, List, Optional, Tuple
23
+ from urllib.parse import urlparse
24
+
25
+ import numpy as np
26
+
27
+ logger = logging.getLogger("phishguard.gnn.graph_builder")
28
+
29
+ # ── Constants ────────────────────────────────────────────────────────
30
+ SUSPICIOUS_TLDS = frozenset({
31
+ ".xyz", ".tk", ".ml", ".ga", ".cf",
32
+ ".gq", ".pw", ".top", ".click",
33
+ })
34
+
35
+ PHISHING_KEYWORDS = frozenset({
36
+ "login", "verify", "secure", "update", "account",
37
+ "banking", "signin", "reset", "confirm", "suspend",
38
+ "webscr", "cmd", "payment", "alert",
39
+ })
40
+
41
+ _re_ip = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
42
+
43
+
44
+ class DomainGraphBuilder:
45
+ """
46
+ Builds PyTorch Geometric Data objects from URL lists.
47
+ Each URL becomes a node with 12-dim feature vector.
48
+ Edges are created from shared IP addresses and shared TLDs.
49
+ """
50
+
51
+ def __init__(self) -> None:
52
+ self._re_ip = _re_ip
53
+
54
+ def extract_node_features(self, url: str) -> np.ndarray:
55
+ """
56
+ Extract 12-dim feature vector from a URL.
57
+
58
+ Returns np.ndarray of shape (12,) with values in [0, 1].
59
+ """
60
+ try:
61
+ parsed = urlparse(url if "://" in url else f"http://{url}")
62
+ except Exception:
63
+ return np.zeros(12, dtype=np.float32)
64
+
65
+ hostname: str = (parsed.hostname or "").lower()
66
+ path: str = parsed.path or ""
67
+ query: str = parsed.query or ""
68
+ scheme: str = parsed.scheme or ""
69
+
70
+ # 1. url_len_norm (normalized by 500)
71
+ url_len_norm = min(len(url) / 500.0, 1.0)
72
+
73
+ # 2. domain_len_norm (normalized by 100)
74
+ domain_len_norm = min(len(hostname) / 100.0, 1.0)
75
+
76
+ # 3. subdomain_count_norm
77
+ parts = hostname.split(".")
78
+ subdomain_count = max(0, len(parts) - 2)
79
+ subdomain_count_norm = min(subdomain_count / 10.0, 1.0)
80
+
81
+ # 4. shannon_entropy_norm (normalized by 5.0)
82
+ entropy = self._shannon_entropy(hostname)
83
+ shannon_entropy_norm = min(entropy / 5.0, 1.0)
84
+
85
+ # 5. digit_ratio
86
+ digit_ratio = 0.0
87
+ if hostname:
88
+ digits = sum(1 for c in hostname if c.isdigit())
89
+ digit_ratio = digits / len(hostname)
90
+
91
+ # 6. hyphen_count_norm
92
+ hyphen_count = hostname.count("-")
93
+ hyphen_count_norm = min(hyphen_count / 10.0, 1.0)
94
+
95
+ # 7. phishing_keyword_hits_norm
96
+ url_lower = url.lower()
97
+ keyword_hits = sum(1 for kw in PHISHING_KEYWORDS if kw in url_lower)
98
+ phishing_keyword_hits_norm = min(keyword_hits / 5.0, 1.0)
99
+
100
+ # 8. suspicious_tld_binary
101
+ suspicious_tld_binary = 0.0
102
+ for tld in SUSPICIOUS_TLDS:
103
+ if hostname.endswith(tld):
104
+ suspicious_tld_binary = 1.0
105
+ break
106
+
107
+ # 9. ip_as_hostname_binary
108
+ ip_as_hostname_binary = 1.0 if self._re_ip.match(hostname) else 0.0
109
+
110
+ # 10. has_https_binary
111
+ has_https_binary = 1.0 if scheme == "https" else 0.0
112
+
113
+ # 11. path_depth_norm
114
+ path_segments = [s for s in path.split("/") if s]
115
+ path_depth_norm = min(len(path_segments) / 10.0, 1.0)
116
+
117
+ # 12. query_string_len_norm
118
+ query_string_len_norm = min(len(query) / 500.0, 1.0)
119
+
120
+ features = np.array([
121
+ url_len_norm,
122
+ domain_len_norm,
123
+ subdomain_count_norm,
124
+ shannon_entropy_norm,
125
+ digit_ratio,
126
+ hyphen_count_norm,
127
+ phishing_keyword_hits_norm,
128
+ suspicious_tld_binary,
129
+ ip_as_hostname_binary,
130
+ has_https_binary,
131
+ path_depth_norm,
132
+ query_string_len_norm,
133
+ ], dtype=np.float32)
134
+
135
+ return features
136
+
137
+ def _shannon_entropy(self, s: str) -> float:
138
+ """Compute Shannon entropy of a string."""
139
+ if not s:
140
+ return 0.0
141
+ length = len(s)
142
+ freq: Dict[str, int] = {}
143
+ for c in s:
144
+ freq[c] = freq.get(c, 0) + 1
145
+ return -sum(
146
+ (count / length) * math.log2(count / length)
147
+ for count in freq.values()
148
+ if count > 0
149
+ )
150
+
151
+ async def _resolve_ips(self, domains: List[str]) -> Dict[str, str]:
152
+ """
153
+ Async DNS resolution for a list of domains.
154
+ Returns dict mapping domain β†’ IP address.
155
+ """
156
+ results: Dict[str, str] = {}
157
+ loop = asyncio.get_running_loop()  # inside a coroutine, so use the running loop (get_event_loop is deprecated here)
158
+
159
+ async def resolve_one(domain: str) -> Tuple[str, str]:
160
+ try:
161
+ ip = await asyncio.wait_for(
162
+ loop.run_in_executor(None, socket.gethostbyname, domain),
163
+ timeout=2.0,
164
+ )
165
+ return domain, ip
166
+ except Exception:
167
+ return domain, ""
168
+
169
+ tasks = [resolve_one(d) for d in domains]
170
+ resolved = await asyncio.gather(*tasks, return_exceptions=True)
171
+ for item in resolved:
172
+ if isinstance(item, tuple):
173
+ domain, ip = item
174
+ if ip:
175
+ results[domain] = ip
176
+ return results
177
+
178
+ def _add_shared_ip_edges(
179
+ self, domains: List[str], ips: Dict[str, str]
180
+ ) -> List[Tuple[int, int]]:
181
+ """
182
+ Create edges between nodes that share the same IP address.
183
+ Returns list of (src, dst) index pairs.
184
+ """
185
+ edges: List[Tuple[int, int]] = []
186
+ # Group domain indices by IP
187
+ ip_to_indices: Dict[str, List[int]] = {}
188
+ for idx, domain in enumerate(domains):
189
+ ip = ips.get(domain, "")
190
+ if ip:
191
+ ip_to_indices.setdefault(ip, []).append(idx)
192
+
193
+ # Create edges between all nodes sharing an IP
194
+ for ip, indices in ip_to_indices.items():
195
+ for i in range(len(indices)):
196
+ for j in range(i + 1, len(indices)):
197
+ edges.append((indices[i], indices[j]))
198
+ edges.append((indices[j], indices[i])) # bidirectional
199
+
200
+ return edges
201
+
202
+ def _add_shared_tld_edges(self, domains: List[str]) -> List[Tuple[int, int]]:
203
+ """
204
+ Create edges between nodes that share the same suspicious TLD.
205
+ """
206
+ edges: List[Tuple[int, int]] = []
207
+ tld_to_indices: Dict[str, List[int]] = {}
208
+
209
+ for idx, domain in enumerate(domains):
210
+ for tld in SUSPICIOUS_TLDS:
211
+ if domain.endswith(tld):
212
+ tld_to_indices.setdefault(tld, []).append(idx)
213
+ break
214
+
215
+ for tld, indices in tld_to_indices.items():
216
+ for i in range(len(indices)):
217
+ for j in range(i + 1, len(indices)):
218
+ edges.append((indices[i], indices[j]))
219
+ edges.append((indices[j], indices[i]))
220
+
221
+ return edges
222
+
223
+ def build_graph(self, urls: List[str], resolve_dns: bool = False) -> dict:
224
+ """
225
+ Build a graph dict from a list of URLs.
226
+
227
+ Returns dict with:
228
+ - features: np.ndarray of shape (N, 12)
229
+ - edges: List of (src, dst) pairs
230
+ - node_count: int
231
+ - edge_count: int
232
+ - domains: List[str]
233
+ """
234
+ if not urls:
235
+ return {
236
+ "features": np.zeros((1, 12), dtype=np.float32),
237
+ "edges": [],
238
+ "node_count": 0,
239
+ "edge_count": 0,
240
+ "domains": [],
241
+ }
242
+
243
+ # Extract features for each URL
244
+ features = np.array(
245
+ [self.extract_node_features(url) for url in urls],
246
+ dtype=np.float32,
247
+ )
248
+
249
+ # Extract domains
250
+ domains: List[str] = []
251
+ for url in urls:
252
+ try:
253
+ parsed = urlparse(url if "://" in url else f"http://{url}")
254
+ domains.append((parsed.hostname or "").lower())
255
+ except Exception:
256
+ domains.append("")
257
+
258
+ # Build edges from shared TLDs (synchronous, fast)
259
+ edges = self._add_shared_tld_edges(domains)
260
+
261
+ # Optionally resolve DNS for shared IP edges
262
+ if resolve_dns and len(domains) > 1:
263
+ try:
264
+ loop = asyncio.get_event_loop()
265
+ if loop.is_running():
266
+ # Already in async context
267
+ pass
268
+ else:
269
+ ips = loop.run_until_complete(self._resolve_ips(domains))
270
+ edges.extend(self._add_shared_ip_edges(domains, ips))
271
+ except RuntimeError:
272
+ pass # Cannot resolve in this context
273
+
274
+ return {
275
+ "features": features,
276
+ "edges": edges,
277
+ "node_count": len(urls),
278
+ "edge_count": len(edges),
279
+ "domains": domains,
280
+ }
281
+
282
+ def build_single_node_graph(self, url: str) -> dict:
283
+ """
284
+ Build a single-node graph for MLP fallback path.
285
+ Used when a graph has fewer than 2 nodes.
286
+ """
287
+ features = self.extract_node_features(url).reshape(1, -1)
288
+ return {
289
+ "features": features,
290
+ "edges": [],
291
+ "node_count": 1,
292
+ "edge_count": 0,
293
+ "domains": [url],
294
+ }
295
+
296
+
297
+ # ── Legacy compatibility wrapper ─────────────────────────────────────
298
+ _builder = DomainGraphBuilder()
299
+
300
+
301
+ def build_domain_graph(urls: List[str]) -> dict:
302
+ """Legacy wrapper for backward compatibility."""
303
+ return _builder.build_graph(urls)
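
A sketch exercising the builder above: single-URL feature extraction, plus a tiny graph where the two .xyz nodes get linked by the shared-TLD rule:

from domain_graph_builder import DomainGraphBuilder

builder = DomainGraphBuilder()
feats = builder.extract_node_features("http://secure-login.verify-account.xyz/update")
print(feats.shape, feats[7], feats[9])   # (12,) 1.0 0.0 -> suspicious TLD, no HTTPS

graph = builder.build_graph(["http://a-login.xyz", "http://b-verify.xyz", "https://example.com"])
print(graph["edge_count"])               # 2: one bidirectional edge between the .xyz nodes
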
email_analyzer.py ADDED
@@ -0,0 +1,191 @@
1
+ # ============================================================
2
+ # PhishGuard AI - email_analyzer.py
3
+ # Analyzes raw emails for phishing indicators.
4
+ # Checks: sender authentication (SPF/DKIM/DMARC),
5
+ # brand spoofing, urgency language, and embedded links.
6
+ #
7
+ # Reuses BERT model from bert_analyzer to avoid duplicate loading.
8
+ # ============================================================
9
+
10
+ import email
11
+ import re
12
+ from email import policy
13
+ from email.parser import BytesParser, Parser
14
+
15
+ # Reuse the NLP analyzer from bert_analyzer
16
+ from bert_analyzer import analyze_text as bert_analyze_text, _ensure_bert_loaded
17
+ import bert_analyzer
18
+
19
+ print("[PhishGuard] Email analyzer initialized (reusing shared NLP)")
20
+
21
+ URGENCY_PATTERNS = [
22
+ r'(act now|immediate action|urgent|verify immediately|account suspended)',
23
+ r'(click here to (verify|confirm|update|restore))',
24
+ r'(your account (will be|has been) (suspended|closed|deactivated))',
25
+ r'(limited time|expires in \d+ hours?)',
26
+ r'(unusual (sign-in|login|activity) detected)',
27
+ r'(confirm your (identity|password|email|account))',
28
+ r'(we noticed (suspicious|unusual|unauthorized))',
29
+ ]
30
+
31
+ BRAND_SPOOFS = [
32
+ 'paypal','amazon','apple','microsoft','google','netflix',
33
+ 'facebook','instagram','linkedin','twitter','chase','wellsfargo',
34
+ 'bankofamerica','citibank','irs','fedex','ups','dhl',
35
+ 'dropbox','docusign','zoom','office365','hdfc','icici','sbi'
36
+ ]
37
+
38
+
39
+ def parse_email_msg(raw):
40
+ """Parse raw email bytes or string into an email.message object."""
41
+ if isinstance(raw, bytes):
42
+ return BytesParser(policy=policy.default).parsebytes(raw)
43
+ return Parser(policy=policy.default).parsestr(raw)
44
+
45
+
46
+ def extract_urls(text: str) -> list:
47
+ """Extract all unique HTTP/HTTPS URLs from text."""
48
+ return list(set(re.findall(r'https?://[^\s<>"\'\\ ]+', text)))
49
+
50
+
51
+ def get_body(msg) -> str:
52
+ """Extract plain text body from email message, falling back to HTML stripped of tags."""
53
+ parts = []
54
+ if msg.is_multipart():
55
+ for part in msg.walk():
56
+ ct = part.get_content_type()
57
+ if ct == 'text/plain':
58
+ try: parts.append(part.get_content())
59
+ except Exception: pass
60
+ elif ct == 'text/html' and not parts:
61
+ try: parts.append(re.sub(r'<[^>]+>', ' ', part.get_content()))
62
+ except Exception: pass
63
+ else:
64
+ try: parts.append(msg.get_content())
65
+ except Exception: pass
66
+ return ' '.join(parts)
67
+
68
+
69
+ def check_sender_auth(msg) -> dict:
70
+ """
71
+ Check email authentication headers:
72
+ - SPF (Sender Policy Framework)
73
+ - DKIM (DomainKeys Identified Mail)
74
+ - DMARC (Domain-based Message Authentication)
75
+ - From/Return-Path domain mismatch
76
+ - Free email provider usage
77
+ """
78
+ auth = msg.get('Authentication-Results', '').lower()
79
+ spf_raw = msg.get('Received-SPF', '').lower()
80
+ spf_pass = 'spf=pass' in auth or 'pass' in spf_raw
81
+ dkim_pass = 'dkim=pass' in auth
82
+ dmarc_pass = 'dmarc=pass' in auth
83
+
84
+ from_addr = msg.get('From', '')
85
+ return_path = msg.get('Return-Path', '')
86
+ from_dom = re.search(r'@([\w.-]+)', from_addr)
87
+ ret_dom = re.search(r'@([\w.-]+)', return_path)
88
+ mismatch = bool(from_dom and ret_dom and
89
+ from_dom.group(1) != ret_dom.group(1))
90
+
91
+ free = {'gmail.com','yahoo.com','hotmail.com','outlook.com','protonmail.com'}
92
+ using_free = (from_dom.group(1).lower() in free) if from_dom else False
93
+
94
+ risk = 0
95
+ if not spf_pass: risk += 25
96
+ if not dkim_pass: risk += 20
97
+ if not dmarc_pass: risk += 15
98
+ if mismatch: risk += 30
99
+ if using_free: risk += 10
100
+
101
+ return {
102
+ "spf_pass": spf_pass, "dkim_pass": dkim_pass,
103
+ "dmarc_pass": dmarc_pass, "domain_mismatch": mismatch,
104
+ "using_free_email": using_free,
105
+ "auth_risk_score": min(risk, 100)
106
+ }
107
+
108
+
109
+ def check_brand_spoofing(subject: str, body: str, sender: str) -> dict:
110
+ """Detect brand names mentioned in email content but not matching sender domain."""
111
+ combined = (subject + ' ' + body + ' ' + sender).lower()
112
+ sender_dom = re.search(r'@([\w.-]+)', sender)
113
+ s_dom = sender_dom.group(1).lower() if sender_dom else ''
114
+ spoofed = [b for b in BRAND_SPOOFS
115
+ if b in combined and b not in s_dom]
116
+ return {
117
+ "brand_spoof_detected": bool(spoofed),
118
+ "spoofed_brands": spoofed
119
+ }
120
+
121
+
122
+ def check_urgency(text: str) -> dict:
123
+ """Detect urgency/pressure language patterns typical of phishing emails."""
124
+ matches = []
125
+ for pat in URGENCY_PATTERNS:
126
+ found = re.findall(pat, text.lower())
127
+ matches.extend(found)
128
+ return {
129
+ "urgency_detected": bool(matches),
130
+ "urgency_matches": [str(m) for m in matches[:5]],
131
+ "urgency_score": min(len(matches) * 15, 60)
132
+ }
133
+
134
+
135
+ def bert_score(text: str) -> float:
136
+ """Run NLP classifier on email text and return phishing probability."""
137
+ if not text.strip():
138
+ return 0.1
139
+ try:
140
+ _ensure_bert_loaded()
141
+ if bert_analyzer._use_bert and bert_analyzer._classifier is not None:
142
+ result = bert_analyzer._classifier(text[:512])[0]
143
+ label = result['label'].upper()
144
+ score = result['score']
145
+ return score if ('SPAM' in label or label == 'LABEL_1') else 1 - score
146
+ else:
147
+ # Use keyword analysis from bert_analyzer
148
+ result = bert_analyze_text("", "", text)
149
+ return result.get("bert_phishing_prob", 0.3)
150
+ except Exception:
151
+ return 0.3
152
+
153
+
154
+ def analyze_email(raw, return_urls: bool = True) -> dict:
155
+ """
156
+ Full phishing analysis of a raw email.
157
+ Pass raw bytes or a string of the full email.
158
+
159
+ Combines: BERT NLP score + sender auth + brand spoofing + urgency detection.
160
+ """
161
+ msg = parse_email_msg(raw)
162
+ subject = msg.get('Subject', '')
163
+ sender = msg.get('From', '')
164
+ body = get_body(msg)
165
+ urls = extract_urls(body)
166
+
167
+ auth = check_sender_auth(msg)
168
+ brand = check_brand_spoofing(subject, body, sender)
169
+ urgency = check_urgency(subject + ' ' + body)
170
+ bert_p = bert_score(subject + '. ' + body[:400])
171
+
172
+ raw_score = (bert_p * 40 +
173
+ auth['auth_risk_score'] * 0.30 +
174
+ urgency['urgency_score'] * 0.20 +
175
+ (30 if brand['brand_spoof_detected'] else 0) * 0.10)
176
+ final = min(raw_score / 100, 1.0)
177
+
178
+ result = {
179
+ "is_phishing": final > 0.60,
180
+ "phishing_probability": round(final, 4),
181
+ "subject": subject,
182
+ "sender": sender,
183
+ "auth_analysis": auth,
184
+ "brand_analysis": brand,
185
+ "urgency_analysis": urgency,
186
+ "bert_score": round(bert_p, 4),
187
+ "extracted_url_count": len(urls),
188
+ }
189
+ if return_urls:
190
+ result["extracted_urls"] = urls[:20]
191
+ return result
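
Given the weights in analyze_email, raw_score is capped at 40 (BERT) + 30 (auth, 100 Γ— 0.30) + 12 (urgency, 60 Γ— 0.20) + 3 (brand, 30 Γ— 0.10) = 85 points, so phishing_probability tops out at 0.85. A usage sketch, runnable inside this repo where bert_analyzer is importable:

from email_analyzer import analyze_email

raw = (
    "From: PayPal Support <support@mail-secure.xyz>\n"
    "Return-Path: <bounce@relay-host.tk>\n"
    "Subject: Urgent: verify immediately or your account will be suspended\n"
    "\n"
    "Click here to verify your account: http://paypal-verify.xyz/login\n"
)
report = analyze_email(raw)
print(report["is_phishing"], report["phishing_probability"])
print(report["auth_analysis"]["domain_mismatch"])   # True: From and Return-Path domains differ
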
feedback_store.py ADDED
@@ -0,0 +1,223 @@
1
+ # ============================================================
2
+ # PhishGuard AI - feedback_store.py
3
+ # Thread-safe feedback storage, retraining trigger, analytics.
4
+ #
5
+ # Storage: feedback_data.jsonl (append-only, one JSON per line)
6
+ # Lock: asyncio.Lock prevents concurrent writes & double-retrain
7
+ # ============================================================
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ import json
13
+ import time
14
+ import asyncio
15
+ import shutil
16
+ import logging
17
+ from datetime import datetime, timezone
18
+ from typing import Optional
19
+
20
+ logger = logging.getLogger("phishguard.feedback")
21
+
22
+ _BASE_DIR = os.path.dirname(os.path.abspath(__file__))
23
+ FEEDBACK_FILE = os.path.join(_BASE_DIR, "feedback_data.jsonl")
24
+ STATE_FILE = os.path.join(_BASE_DIR, "retrain_state.json")
25
+
26
+ # ── Async lock for thread-safe writes ────────────────────────────────────────
27
+ _write_lock = asyncio.Lock()
28
+
29
+ # ── Retrain state (persisted to retrain_state.json) ──────────────────────────
30
+ _retrain_state = {
31
+ "model_version": 1,
32
+ "total_feedback": 0,
33
+ "unprocessed_count": 0,
34
+ "phishing_corrections": 0,
35
+ "safe_corrections": 0,
36
+ "last_retrain": None, # ISO 8601 timestamp
37
+ "retrain_history": [], # [{ts, samples, accuracy, version}]
38
+ }
39
+
40
+
41
+ def _load_state():
42
+ """Load persisted retrain state from disk."""
43
+ global _retrain_state
44
+ if os.path.exists(STATE_FILE):
45
+ try:
46
+ with open(STATE_FILE, "r") as f:
47
+ saved = json.load(f)
48
+ _retrain_state.update(saved)
49
+ logger.info(f"[FeedbackStore] State loaded | version={_retrain_state['model_version']} | total={_retrain_state['total_feedback']}")
50
+ except Exception as e:
51
+ logger.warning(f"[FeedbackStore] Could not load state: {e}")
52
+
53
+
54
+ def _save_state():
55
+ """Persist retrain state to disk (atomic write)."""
56
+ try:
57
+ tmp = STATE_FILE + ".tmp"
58
+ with open(tmp, "w") as f:
59
+ json.dump(_retrain_state, f, indent=2, default=str)
60
+ os.replace(tmp, STATE_FILE)
61
+ except Exception as e:
62
+ logger.warning(f"[FeedbackStore] Could not save state: {e}")
63
+
64
+
65
+ # Load state on module import
66
+ _load_state()
67
+
68
+
69
+ # ══════════════════════════════════════════════════════════════════════════════
70
+ # FEEDBACK STORAGE
71
+ # ══════════════════════════════════════════════════════════════════════════════
72
+
73
+ async def append_feedback(
74
+ url: str,
75
+ label: str,
76
+ source: str = "user_feedback",
77
+ original_prediction: Optional[float] = None,
78
+ ) -> dict:
79
+ """
80
+ Thread-safe append of a feedback entry to feedback_data.jsonl.
81
+
82
+ Returns: {"success": True, "feedback_count": N, "unprocessed": M}
83
+ """
84
+ entry = {
85
+ "url": url,
86
+ "label": label, # "phishing" or "safe"
87
+ "timestamp": datetime.now(timezone.utc).isoformat(),
88
+ "source": source,
89
+ "original_prediction": round(original_prediction, 4) if original_prediction is not None else None,
90
+ }
91
+
92
+ async with _write_lock:
93
+ try:
94
+ with open(FEEDBACK_FILE, "a") as f:
95
+ f.write(json.dumps(entry) + "\n")
96
+ except Exception as e:
97
+ logger.error(f"[FeedbackStore] Write failed: {e}")
98
+ return {"success": False, "error": str(e)}
99
+
100
+ # Update in-memory state
101
+ _retrain_state["total_feedback"] += 1
102
+ _retrain_state["unprocessed_count"] += 1
103
+ if label == "phishing":
104
+ _retrain_state["phishing_corrections"] += 1
105
+ elif label == "safe":
106
+ _retrain_state["safe_corrections"] += 1
107
+
108
+ _save_state()
109
+
110
+ logger.info(f"[FeedbackStore] Saved | url={url} | label={label} | total={_retrain_state['total_feedback']}")
111
+
112
+ return {
113
+ "success": True,
114
+ "feedback_count": _retrain_state["total_feedback"],
115
+ "unprocessed": _retrain_state["unprocessed_count"],
116
+ }
117
+
118
+
119
+ def get_unprocessed_count() -> int:
120
+ """Number of feedback entries since last retraining."""
121
+ return _retrain_state["unprocessed_count"]
122
+
123
+
124
+ def get_model_version() -> int:
125
+ """Current model version number."""
126
+ return _retrain_state["model_version"]
127
+
128
+
129
+ def get_stats() -> dict:
130
+ """Return feedback analytics for the /feedback/stats endpoint."""
131
+ return {
132
+ "total_feedback": _retrain_state["total_feedback"],
133
+ "phishing_corrections": _retrain_state["phishing_corrections"],
134
+ "safe_corrections": _retrain_state["safe_corrections"],
135
+ "unprocessed_count": _retrain_state["unprocessed_count"],
136
+ "last_retrain": _retrain_state["last_retrain"],
137
+ "model_version": _retrain_state["model_version"],
138
+ "retrain_history": _retrain_state["retrain_history"][-10:], # last 10
139
+ }
140
+
141
+
142
+ def get_recent_entries(n: int = 50) -> list:
143
+ """Read the last N feedback entries from the JSONL file."""
144
+ if not os.path.exists(FEEDBACK_FILE):
145
+ return []
146
+ try:
147
+ with open(FEEDBACK_FILE, "r") as f:
148
+ lines = f.readlines()
149
+ entries = []
150
+ for line in lines[-n:]:
151
+ line = line.strip()
152
+ if line:
153
+ entries.append(json.loads(line))
154
+ return entries
155
+ except Exception:
156
+ return []
157
+
158
+
159
+ # ══════════════════════════════════════════════════════════════════════════════
160
+ # RETRAINING PIPELINE
161
+ # ══════════════════════════════════════════════════════════════════════════════
162
+
163
+ RETRAIN_THRESHOLD = 50
164
+ _retrain_running = False
165
+
166
+
167
+ def should_retrain() -> bool:
168
+ """Check if retraining should be triggered."""
169
+ return (
170
+ _retrain_state["unprocessed_count"] >= RETRAIN_THRESHOLD
171
+ and not _retrain_running
172
+ )
173
+
174
+
175
+ def mark_retrain_complete(samples: int, accuracy: float):
176
+ """
177
+ Called after successful retraining.
178
+ Increments model_version, resets unprocessed counter, logs history.
179
+ """
180
+ _retrain_state["model_version"] += 1
181
+ _retrain_state["unprocessed_count"] = 0
182
+ _retrain_state["last_retrain"] = datetime.now(timezone.utc).isoformat()
183
+ _retrain_state["retrain_history"].append({
184
+ "timestamp": _retrain_state["last_retrain"],
185
+ "samples": samples,
186
+ "accuracy": round(accuracy, 4),
187
+ "version": _retrain_state["model_version"],
188
+ })
189
+ # Keep only last 50 history entries
190
+ if len(_retrain_state["retrain_history"]) > 50:
191
+ _retrain_state["retrain_history"] = _retrain_state["retrain_history"][-50:]
192
+ _save_state()
193
+ logger.info(
194
+ f"[FeedbackStore] Retrained on {samples} feedback samples. "
195
+ f"New accuracy: {accuracy:.2%}. Model version: {_retrain_state['model_version']}"
196
+ )
197
+
198
+
199
+ def archive_feedback_file():
200
+ """Move the processed feedback file to a timestamped backup."""
201
+ if os.path.exists(FEEDBACK_FILE):
202
+ archive = FEEDBACK_FILE + f".{int(time.time())}.bak"
203
+ try:
204
+ shutil.move(FEEDBACK_FILE, archive)
205
+ logger.info(f"[FeedbackStore] Archived feedback β†’ {archive}")
206
+ except Exception as e:
207
+ logger.warning(f"[FeedbackStore] Archive failed: {e}")
208
+
209
+
210
+ def load_feedback_entries() -> list:
211
+ """Load ALL entries from the feedback JSONL file."""
212
+ if not os.path.exists(FEEDBACK_FILE):
213
+ return []
214
+ entries = []
215
+ try:
216
+ with open(FEEDBACK_FILE, "r") as f:
217
+ for line in f:
218
+ line = line.strip()
219
+ if line:
220
+ entries.append(json.loads(line))
221
+ except Exception as e:
222
+ logger.error(f"[FeedbackStore] Read failed: {e}")
223
+ return entries
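A minimal usage sketch of this module (assumes it is importable as feedback_store; the URL and numbers are hypothetical):

    import asyncio
    import feedback_store

    async def demo():
        # Thread-safe JSONL append of one labeled feedback entry
        res = await feedback_store.append_feedback(
            "http://suspicious.example/login", "phishing", original_prediction=0.42
        )
        print(res["feedback_count"], res["unprocessed"])
        # should_retrain() becomes True once RETRAIN_THRESHOLD (50) unprocessed entries accumulate
        if feedback_store.should_retrain():
            # ... run the retraining pipeline here, then record the outcome:
            feedback_store.mark_retrain_complete(samples=50, accuracy=0.95)

    asyncio.run(demo())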
generate_icons.py ADDED
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env python3
2
+ """Generate PhishGuard extension icons at 16, 48, 128 px using pure Python."""
3
+ import struct, zlib, os
4
+
5
+ def create_png(width, height, pixels):
6
+ """Create a minimal PNG from RGBA pixel data."""
7
+ def chunk(chunk_type, data):
8
+ c = chunk_type + data
9
+ return struct.pack('>I', len(data)) + c + struct.pack('>I', zlib.crc32(c) & 0xffffffff)
10
+
11
+ raw = b''
12
+ for y in range(height):
13
+ raw += b'\x00'
14
+ for x in range(width):
15
+ idx = (y * width + x) * 4
16
+ raw += bytes(pixels[idx:idx+4])
17
+
18
+ return (b'\x89PNG\r\n\x1a\n' +
19
+ chunk(b'IHDR', struct.pack('>IIBBBBB', width, height, 8, 6, 0, 0, 0)) +
20
+ chunk(b'IDAT', zlib.compress(raw)) +
21
+ chunk(b'IEND', b''))
22
+
23
+ def draw_shield_icon(size):
24
+ """Draw a shield icon with checkmark."""
25
+ pixels = [0] * (size * size * 4)
26
+ cx, cy = size / 2, size / 2
27
+ sr, sg, sb = 0x53, 0x4A, 0xB7
28
+ hr, hg, hb = 0x7B, 0x73, 0xD4
29
+
30
+ for y in range(size):
31
+ for x in range(size):
32
+ idx = (y * size + x) * 4
33
+ nx = (x - cx) / (size / 2)
34
+ ny = (y - cy) / (size / 2)
35
+
36
+ in_shield = False
37
+ if ny < -0.05:
38
+ if abs(nx) < 0.75:
39
+ in_shield = True
40
+ elif ny < 0.5:
41
+ w = 0.75 * (1 - ny * 0.8)
42
+ if abs(nx) < w:
43
+ in_shield = True
44
+ else:
45
+ w = 0.75 * max(0, (1.0 - ny) * 1.4)
46
+ if abs(nx) < w:
47
+ in_shield = True
48
+
49
+ if ny < -0.8:
50
+ in_shield = False
51
+
52
+ if in_shield:
53
+ blend = max(0, min(1, 0.5 - nx * 0.3 - ny * 0.2))
54
+ r = int(sr + (hr - sr) * blend)
55
+ g = int(sg + (hg - sg) * blend)
56
+ b = int(sb + (hb - sb) * blend)
57
+ pixels[idx:idx+4] = [r, g, b, 255]
58
+ else:
59
+ pixels[idx:idx+4] = [0, 0, 0, 0]
60
+
61
+ if size >= 32:
62
+ check_points = []
63
+ for t in range(100):
64
+ p = t / 100.0
65
+ if p < 0.4:
66
+ pp = p / 0.4
67
+ px = int(cx + (-0.25 + pp * 0.25) * size * 0.6)
68
+ py = int(cy + (-0.1 + pp * 0.3) * size * 0.6)
69
+ else:
70
+ pp = (p - 0.4) / 0.6
71
+ px = int(cx + (0.0 + pp * 0.35) * size * 0.6)
72
+ py = int(cy + (0.2 - pp * 0.45) * size * 0.6)
73
+ check_points.append((px, py))
74
+
75
+ thickness = max(1, int(size * 0.06))
76
+ for px, py in check_points:
77
+ for dy in range(-thickness, thickness+1):
78
+ for dx in range(-thickness, thickness+1):
79
+ xx, yy = px + dx, py + dy
80
+ if 0 <= xx < size and 0 <= yy < size:
81
+ idx = (yy * size + xx) * 4
82
+ if pixels[idx+3] > 0:
83
+ pixels[idx:idx+4] = [255, 255, 255, 240]
84
+
85
+ return pixels
86
+
87
+ icons_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'extension', 'icons')
88
+ os.makedirs(icons_dir, exist_ok=True)
89
+
90
+ for size in [16, 48, 128]:
91
+ pixels = draw_shield_icon(size)
92
+ png_data = create_png(size, size, pixels)
93
+ path = os.path.join(icons_dir, f'icon{size}.png')
94
+ with open(path, 'wb') as f:
95
+ f.write(png_data)
96
+ print(f"Created {path} ({len(png_data)} bytes)")
97
+
98
+ print("Done! All icons generated.")
gmail_scanner.js ADDED
@@ -0,0 +1,193 @@
1
+ // gmail_scanner.js
2
+
3
+ console.log("PhishGuard AI: Gmail Scanner loaded.");
4
+
5
+ // Gmail's DOM can change, but the email body text is commonly stored in elements matching '.a3s' or '.ii.gt'
6
+ const EMAIL_BODY_SELECTOR = '.a3s, .ii.gt';
7
+
8
+ // Function to inject a visible warning banner into the Gmail UI
9
+ function injectWarningBanner(emailContainer, message) {
10
+ // Prevent duplicate banners if one is already injected
11
+ if (emailContainer.parentNode && emailContainer.parentNode.querySelector('.phishguard-banner')) {
12
+ return;
13
+ }
14
+
15
+ const banner = document.createElement('div');
16
+ banner.className = 'phishguard-banner';
17
+
18
+ // Styling the banner to look native but urgent, fitting Google's Material Design
19
+ banner.style.backgroundColor = '#fce8e6';
20
+ banner.style.color = '#c5221f';
21
+ banner.style.border = '1px solid #faa59f';
22
+ banner.style.borderRadius = '8px';
23
+ banner.style.padding = '12px 16px';
24
+ // Added margin to ensure it doesn't overlap text awkwardly
25
+ banner.style.margin = '16px auto';
26
+ banner.style.fontFamily = '"Google Sans", Roboto, Arial, sans-serif';
27
+ banner.style.fontSize = '14px';
28
+ banner.style.fontWeight = '500';
29
+ banner.style.lineHeight = '20px';
30
+ banner.style.display = 'flex';
31
+ banner.style.alignItems = 'center';
32
+ banner.style.boxShadow = '0 1px 2px 0 rgba(60,64,67,0.3), 0 1px 3px 1px rgba(60,64,67,0.15)';
33
+
34
+ // Add SVG Icon for warning
35
+ const iconSvg = `
36
+ <svg focusable="false" width="24" height="24" viewBox="0 0 24 24" style="fill: #c5221f; margin-right: 16px; flex-shrink: 0;">
37
+ <path d="M1 21h22L12 2 1 21zm12-3h-2v-2h2v2zm0-4h-2v-4h2v4z"></path>
38
+ </svg>
39
+ `;
40
+
41
+ const textContent = document.createElement('span');
42
+ textContent.innerText = message || '🚨 PhishGuard Warning: This email contains suspicious links.';
43
+
44
+ // Construct the banner
45
+ banner.innerHTML = iconSvg;
46
+ banner.appendChild(textContent);
47
+
48
+ // Insert banner at the top of the email container.
49
+ // By inserting before the email container, we keep it visible at the top of the body.
50
+ if (emailContainer.parentNode) {
51
+ emailContainer.parentNode.insertBefore(banner, emailContainer);
52
+ }
53
+ }
54
+
55
+
56
+ // Helper function to extract all unique URLs from the email body
57
+ function extractUrlsFromBody(emailContainer) {
58
+ const links = emailContainer.querySelectorAll('a[href]');
59
+ const urls = new Set(); // Use a Set to store unique URLs
60
+
61
+ links.forEach(link => {
62
+ const href = link.href;
63
+ // Basic filter to ignore mailto: or javascript: links, and only keep http/https
64
+ if (href && (href.startsWith('http://') || href.startsWith('https://'))) {
65
+ urls.add(href);
66
+ }
67
+ });
68
+
69
+ return Array.from(urls);
70
+ }
71
+
72
+ // Function to safely extract the sender's actual email address
73
+ function extractSenderEmail(emailContainer) {
74
+ // Gmail usually groups each email message into a container block.
75
+ // We traverse up to find a common parent containing both header and body.
76
+ // '.kv' or 'table' or '.adn' is often the parent wrapper for an individual message.
77
+ let messageWrapper = emailContainer.closest('.adn') || emailContainer.closest('.kv') || document;
78
+
79
+ // The sender element usually has the class '.gD' and contains the 'email' attribute.
80
+ const senderElement = messageWrapper.querySelector('.gD');
81
+
82
+ if (senderElement && senderElement.getAttribute('email')) {
83
+ return senderElement.getAttribute('email');
84
+ }
85
+
86
+ // Fallback: Sometimes the email address is enclosed in brackets inside a '.go' element.
87
+ const fallbackSenderElement = messageWrapper.querySelector('.go');
88
+ if (fallbackSenderElement && fallbackSenderElement.innerText) {
89
+ // e.g. "<sender@example.com>" -> matched and extracted
90
+ const match = fallbackSenderElement.innerText.match(/<([^>]+)>/);
91
+ if (match && match[1]) {
92
+ return match[1];
93
+ }
94
+ }
95
+
96
+ return "Unknown Sender";
97
+ }
98
+
99
+ // Function to handle newly opened emails
100
+ function handleEmailOpened(emailContainer) {
101
+ // Prevent re-scanning the same email element
102
+ if (emailContainer.dataset.pgScanned === "true") {
103
+ return;
104
+ }
105
+
106
+ console.log("PhishGuard AI: New email thread opened. Extracting data...");
107
+
108
+ // Mark as scanned
109
+ emailContainer.dataset.pgScanned = "true";
110
+
111
+ // 1. Extract plain text content
112
+ const emailBodyText = emailContainer.innerText;
113
+
114
+ // 2. Extract embedded URLs
115
+ const urls = extractUrlsFromBody(emailContainer);
116
+
117
+ // 3. Extract sender email address
118
+ const sender = extractSenderEmail(emailContainer);
119
+
120
+ // 4. Extract subject line (.hP is a standard class for Gmail's main subject line)
121
+ // Sometimes the subject might be in a '.bog' element. We'll default to 'h2.hP'.
122
+ const subjectElement = document.querySelector('h2.hP') || document.querySelector('.bog');
123
+ const subject = subjectElement ? subjectElement.innerText.trim() : "No Subject Found";
124
+
125
+ // Package the extracted data into a JSON payload
126
+ const emailPayload = {
127
+ sender: sender,
128
+ subject: subject,
129
+ body: emailBodyText,
130
+ urls: urls,
131
+ timestamp: new Date().toISOString()
132
+ };
133
+
134
+ console.log("PhishGuard AI extracted payload:", emailPayload);
135
+
136
+ // Send background message to service worker
137
+ chrome.runtime.sendMessage(
138
+ {
139
+ action: "analyzeEmail",
140
+ data: emailPayload
141
+ },
142
+ (response) => {
143
+ if (chrome.runtime.lastError) {
144
+ console.error("PhishGuard AI: Error communicating with background script:", chrome.runtime.lastError);
145
+ return;
146
+ }
147
+
148
+ console.log("PhishGuard AI Background Analysis Response:", response);
149
+
150
+ // Assume the background script returns `response.analysis` containing `probability` or `isPhishing` flag
151
+ const analysis = response && response.analysis ? response.analysis : {};
152
+
153
+ if (analysis.isPhishing === true || analysis.probability > 0.70) {
154
+ console.warn("PhishGuard AI: High risk email detected! Injecting banner...");
155
+ injectWarningBanner(
156
+ emailContainer,
157
+ '🚨 PhishGuard Warning: This email contains suspicious links and exhibits high-risk phishing behavior.'
158
+ );
159
+ }
160
+ }
161
+ );
162
+ }
163
+
164
+ // Set up a MutationObserver to watch for DOM changes
165
+ // This effectively detects when Gmail dynamically loads an individual email view into the DOM
166
+ const observer = new MutationObserver((mutationsList) => {
167
+ for (const mutation of mutationsList) {
168
+ if (mutation.type === 'childList') {
169
+ mutation.addedNodes.forEach(node => {
170
+ if (node.nodeType === Node.ELEMENT_NODE) {
171
+ // Check if the added node itself is the email body container
172
+ if (node.matches && node.matches(EMAIL_BODY_SELECTOR)) {
173
+ handleEmailOpened(node);
174
+ }
175
+
176
+ // Also search recursively within the added node's subtree for the email body
177
+ if (node.querySelectorAll) {
178
+ const emailBodies = node.querySelectorAll(EMAIL_BODY_SELECTOR);
179
+ emailBodies.forEach(body => handleEmailOpened(body));
180
+ }
181
+ }
182
+ });
183
+ }
184
+ }
185
+ });
186
+
187
+ // Start observing the document body for deeper added nodes (like when navigating between emails)
188
+ observer.observe(document.body, {
189
+ childList: true,
190
+ subtree: true
191
+ });
192
+
193
+ console.log("PhishGuard AI: MutationObserver is listening for email thread opens.");
gnn_inference.py ADDED
@@ -0,0 +1,274 @@
1
+ # ============================================================
2
+ # PhishGuard AI - gnn/gnn_inference.py
3
+ # GNN inference wrapper for main.py.
4
+ # Loads model once at startup, reuses for every request.
5
+ # Supports: predict, hot-reload, incremental_update.
6
+ # ============================================================
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ import sys
12
+ import random
13
+ import logging
14
+ from pathlib import Path
15
+ from typing import List, Optional, Tuple
16
+
17
+ import torch
18
+
19
+ logger = logging.getLogger("phishguard.gnn.inference")
20
+
21
+ # Add parent paths
22
+ _GNN_DIR = Path(__file__).parent
23
+ _BACKEND_DIR = _GNN_DIR.parent
24
+ sys.path.insert(0, str(_GNN_DIR))
25
+ sys.path.insert(0, str(_BACKEND_DIR))
26
+
27
+ from domain_graph_builder import DomainGraphBuilder
28
+ from gnn_model import load_gnn_model, PhishMLP, PYGEOM_AVAILABLE, INPUT_DIM
29
+
30
+ if PYGEOM_AVAILABLE:
31
+ from gnn_model import PhishGNN
32
+
33
+ MODEL_PATH = _GNN_DIR / "gnn_weights.pt"
34
+ REPLAY_BUFFER_PATH = _BACKEND_DIR / "data" / "gnn_replay_buffer.pt"
35
+
36
+
37
+ class GNNInference:
38
+ """
39
+ GNN inference wrapper with hot-reload and incremental update support.
40
+ """
41
+
42
+ def __init__(self, weights_path: Optional[Path] = None) -> None:
43
+ self._weights_path = weights_path or MODEL_PATH
44
+ self._model: Optional[torch.nn.Module] = None
45
+ self._builder = DomainGraphBuilder()
46
+ self._loaded = False
47
+
48
+ def load(self, weights_path: Optional[Path] = None) -> bool:
49
+ """Load GNN model from weights file."""
50
+ path = weights_path or self._weights_path
51
+ self._model = load_gnn_model(str(path) if path.exists() else None)
52
+ self._loaded = self._model is not None
53
+ if self._loaded:
54
+ logger.info(f"GNN model loaded from {path}")
55
+ return self._loaded
56
+
57
+ def predict(self, url: str, related_urls: Optional[List[str]] = None) -> float:
58
+ """
59
+ Predict phishing probability for a URL.
60
+ Returns P_gnn ∈ [0,1].
61
+ Falls back to MLP if model unavailable or graph too small.
62
+ """
63
+ if not self._loaded:
64
+ self.load()
65
+
66
+ if self._model is None:
67
+ return 0.5 # Neutral when model unavailable
68
+
69
+ urls = [url] + (related_urls or [])
70
+
71
+ # Single URL → MLP fallback path
72
+ if len(urls) == 1:
73
+ graph = self._builder.build_single_node_graph(url)
74
+ else:
75
+ graph = self._builder.build_graph(urls)
76
+
77
+ x = torch.tensor(graph["features"], dtype=torch.float)
78
+
79
+ edges = graph["edges"]
80
+ if edges and len(edges) > 0:
81
+ edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
82
+ else:
83
+ n = x.size(0)
84
+ edge_index = torch.arange(n).unsqueeze(0).repeat(2, 1)
85
+
86
+ prob = self._model.predict_proba(x, edge_index)
87
+ return round(float(prob), 4)
88
+
89
+ def reload(self, weights_path: Optional[Path] = None) -> bool:
90
+ """Hot-reload model with new weights (no server restart needed)."""
91
+ path = weights_path or self._weights_path
92
+ new_model = load_gnn_model(str(path))
93
+ if new_model is not None:
94
+ self._model = new_model
95
+ self._loaded = True
96
+ logger.info(f"GNN model hot-reloaded from {path}")
97
+ return True
98
+ logger.warning(f"GNN hot-reload failed from {path}")
99
+ return False
100
+
101
+ def incremental_update(
102
+ self,
103
+ samples: List[Tuple[str, int]],
104
+ replay_buffer_path: Optional[Path] = None,
105
+ lr: float = 5e-4,
106
+ epochs: int = 5,
107
+ ) -> Optional[float]:
108
+ """
109
+ Incremental update on feedback samples + replay buffer.
110
+ Returns accuracy_delta or None if failed.
111
+
112
+ samples: list of (url, label) where label is 0 or 1
113
+ """
114
+ if self._model is None:
115
+ logger.warning("GNN not loaded, cannot incrementally update")
116
+ return None
117
+
118
+ if len(samples) < 5:
119
+ logger.warning(f"Too few samples ({len(samples)}) for GNN update")
120
+ return None
121
+
122
+ try:
123
+ import torch.nn.functional as F
124
+
125
+ device = torch.device("cpu")
126
+ model = self._model.to(device)
127
+ builder = DomainGraphBuilder()
128
+
129
+ # Build graphs from new feedback
130
+ new_graphs = []
131
+ CHUNK = 4
132
+ phish = [url for url, label in samples if label == 1]
133
+ legit = [url for url, label in samples if label == 0]
134
+
135
+ for urls, label in [(phish, 1), (legit, 0)]:
136
+ for i in range(0, len(urls), CHUNK):
137
+ chunk = urls[i:i + CHUNK]
138
+ if not chunk:
139
+ continue
140
+ graph = builder.build_graph(chunk)
141
+ x = torch.tensor(graph["features"], dtype=torch.float)
142
+ edges = graph["edges"]
143
+ if edges:
144
+ ei = torch.tensor(edges, dtype=torch.long).t().contiguous()
145
+ else:
146
+ n = x.size(0)
147
+ ei = torch.arange(n).unsqueeze(0).repeat(2, 1)
148
+ new_graphs.append({
149
+ "x": x, "edge_index": ei,
150
+ "y": torch.tensor([float(label)]),
151
+ })
152
+
153
+ # Load replay buffer (20% mix)
154
+ buf_path = replay_buffer_path or REPLAY_BUFFER_PATH
155
+ replay_graphs = []
156
+ if buf_path.exists():
157
+ try:
158
+ all_replay = torch.load(buf_path, map_location="cpu", weights_only=False)
159
+ replay_count = max(1, len(all_replay) // 5) # 20%
160
+ replay_graphs = random.sample(all_replay, min(replay_count, len(all_replay)))
161
+ except Exception as e:
162
+ logger.warning(f"Replay buffer load failed: {e}")
163
+
164
+ # Merge: 80% new + 20% replay
165
+ dataset = new_graphs + replay_graphs
166
+ random.shuffle(dataset)
167
+
168
+ if not dataset:
169
+ return None
170
+
171
+ # Pre-update accuracy
172
+ model.eval()
173
+ pre_correct = 0
174
+ with torch.no_grad():
175
+ for item in dataset:
176
+ out = model(item["x"].to(device), item["edge_index"].to(device))
177
+ pred = 1 if out.squeeze().item() >= 0.5 else 0
178
+ pre_correct += int(pred == int(item["y"].item()))
179
+ pre_acc = pre_correct / len(dataset)
180
+
181
+ # Train
182
+ optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
183
+ model.train()
184
+
185
+ for epoch in range(epochs):
186
+ random.shuffle(dataset)
187
+ total_loss = 0.0
188
+ for item in dataset:
189
+ x = item["x"].to(device)
190
+ ei = item["edge_index"].to(device)
191
+ y = item["y"].to(device)
192
+ optimizer.zero_grad()
193
+ out = model(x, ei)
194
+ loss = F.binary_cross_entropy(out.squeeze(), y.squeeze())
195
+ loss.backward()
196
+ optimizer.step()
197
+ total_loss += loss.item()
198
+ logger.info(f"GNN incremental epoch {epoch+1}/{epochs}, loss={total_loss/len(dataset):.4f}")
199
+
200
+ # Post-update accuracy
201
+ model.eval()
202
+ post_correct = 0
203
+ with torch.no_grad():
204
+ for item in dataset:
205
+ out = model(item["x"].to(device), item["edge_index"].to(device))
206
+ pred = 1 if out.squeeze().item() >= 0.5 else 0
207
+ post_correct += int(pred == int(item["y"].item()))
208
+ post_acc = post_correct / len(dataset)
209
+
210
+ delta = post_acc - pre_acc
211
+ self._model = model
212
+
213
+ # Save weights
214
+ torch.save(model.state_dict(), self._weights_path)
215
+ logger.info(f"GNN incremental update: {pre_acc:.4f} β†’ {post_acc:.4f} (Ξ”={delta:+.4f})")
216
+
217
+ # Update replay buffer (rolling 500)
218
+ try:
219
+ existing = []
220
+ if buf_path.exists():
221
+ existing = torch.load(buf_path, map_location="cpu", weights_only=False)
222
+ combined = existing + new_graphs
223
+ if len(combined) > 500:
224
+ combined = combined[-500:]
225
+ buf_path.parent.mkdir(parents=True, exist_ok=True)
226
+ torch.save(combined, buf_path)
227
+ except Exception as e:
228
+ logger.warning(f"Replay buffer update failed: {e}")
229
+
230
+ return round(delta, 4)
231
+
232
+ except Exception as e:
233
+ logger.error(f"GNN incremental update failed: {e}")
234
+ return None
235
+
236
+ @property
237
+ def is_loaded(self) -> bool:
238
+ return self._loaded
239
+
240
+
241
+ # ── Legacy compatibility functions ───────────────────────────────────
242
+ _inference = GNNInference()
243
+
244
+
245
+ def analyze_url_with_gnn(url: str, related_urls: list = None) -> dict:
246
+ """Legacy wrapper for backward compatibility."""
247
+ if not _inference.is_loaded:
248
+ _inference.load()
249
+
250
+ if not _inference.is_loaded:
251
+ return {
252
+ "gnn_phish_prob": None,
253
+ "tier3_status": "model_not_loaded",
254
+ "node_count": 0,
255
+ "edge_count": 0,
256
+ "graph_suspicious": False,
257
+ }
258
+
259
+ prob = _inference.predict(url, related_urls)
260
+ return {
261
+ "gnn_phish_prob": prob,
262
+ "node_count": 1 + len(related_urls or []),
263
+ "edge_count": 0,
264
+ "graph_suspicious": prob > 0.6,
265
+ }
266
+
267
+
268
+ def reload_model(new_weights_path: str = None) -> bool:
269
+ path = Path(new_weights_path) if new_weights_path else None
270
+ return _inference.reload(path)
271
+
272
+
273
+ def is_model_loaded() -> bool:
274
+ return _inference.is_loaded
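A minimal usage sketch (the URLs are hypothetical; without a gnn_weights.pt file the wrapper still loads an untrained model, so treat its scores as placeholders):

    from gnn_inference import GNNInference

    gnn = GNNInference()
    gnn.load()                       # loads gnn_weights.pt if present
    p = gnn.predict(
        "http://login-verify.example/account",
        related_urls=["http://login-verify.example/reset"],
    )
    print(p)                         # P_gnn in [0, 1], rounded to 4 decimals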
gnn_model.py ADDED
@@ -0,0 +1,183 @@
1
+ # ============================================================
2
+ # PhishGuard AI - gnn/gnn_model.py
3
+ # GNN + MLP model definitions for phishing graph classification.
4
+ #
5
+ # PhishGNN: 3-layer GCN with global_mean_pool → Linear → Sigmoid
6
+ # GCNConv(12→64) → ReLU → GCNConv(64→32) → ReLU →
7
+ # GCNConv(32→16) → global_mean_pool → Linear(16→1) → Sigmoid
8
+ #
9
+ # PhishMLP: Fallback for single URL or when torch_geometric unavailable
10
+ # Linear(12→64) → ReLU → Dropout(0.3) → Linear(64→1) → Sigmoid
11
+ # ============================================================
12
+
13
+ from __future__ import annotations
14
+
15
+ import os
16
+ import logging
17
+ from typing import Optional
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+
23
+ logger = logging.getLogger("phishguard.gnn.model")
24
+
25
+ INPUT_DIM: int = 12 # 12-dim node features
26
+ HIDDEN_DIM: int = 64
27
+ OUTPUT_DIM: int = 1 # binary: sigmoid output
28
+
29
+ # ── Try importing PyTorch Geometric ──────────────────────────────────
30
+ PYGEOM_AVAILABLE: bool = False
31
+ try:
32
+ from torch_geometric.nn import GCNConv, global_mean_pool
33
+ PYGEOM_AVAILABLE = True
34
+ logger.info("PyTorch Geometric found β€” using full GCN model")
35
+ except ImportError:
36
+ PYGEOM_AVAILABLE = False
37
+ logger.info("PyTorch Geometric not found β€” using MLP fallback")
38
+
39
+
40
+ # ── PhishGNN: Full 3-layer Graph Convolutional Network ───────────────
41
+ if PYGEOM_AVAILABLE:
42
+ class PhishGNN(nn.Module):
43
+ """
44
+ 3-layer GCN for graph-level phishing classification.
45
+ Architecture from spec:
46
+ GCNConv(12→64) → ReLU → GCNConv(64→32) → ReLU →
47
+ GCNConv(32→16) → global_mean_pool → Linear(16→1) → Sigmoid
48
+ """
49
+
50
+ def __init__(
51
+ self,
52
+ in_channels: int = INPUT_DIM,
53
+ hidden: int = HIDDEN_DIM,
54
+ out_channels: int = OUTPUT_DIM,
55
+ ) -> None:
56
+ super().__init__()
57
+ self.conv1 = GCNConv(in_channels, hidden) # 12 → 64
58
+ self.conv2 = GCNConv(hidden, hidden // 2) # 64 → 32
59
+ self.conv3 = GCNConv(hidden // 2, hidden // 4) # 32 → 16
60
+ self.fc = nn.Linear(hidden // 4, out_channels) # 16 → 1
61
+
62
+ def forward(
63
+ self,
64
+ x: torch.Tensor,
65
+ edge_index: torch.Tensor,
66
+ batch: Optional[torch.Tensor] = None,
67
+ ) -> torch.Tensor:
68
+ # Handle empty edge_index
69
+ if edge_index.numel() == 0:
70
+ edge_index = torch.zeros((2, 0), dtype=torch.long, device=x.device)
71
+
72
+ x = F.relu(self.conv1(x, edge_index))
73
+ x = F.relu(self.conv2(x, edge_index))
74
+ x = F.relu(self.conv3(x, edge_index))
75
+
76
+ if batch is None:
77
+ batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)
78
+
79
+ x = global_mean_pool(x, batch) # (batch_size, 16)
80
+ x = self.fc(x) # (batch_size, 1)
81
+ return torch.sigmoid(x) # [0, 1]
82
+
83
+ def predict_proba(
84
+ self,
85
+ x: torch.Tensor,
86
+ edge_index: torch.Tensor,
87
+ batch: Optional[torch.Tensor] = None,
88
+ ) -> float:
89
+ """Return P_gnn ∈ [0,1] β€” probability of phishing."""
90
+ self.eval()
91
+ with torch.no_grad():
92
+ output = self.forward(x, edge_index, batch)
93
+ return output.squeeze().item()
94
+
95
+
96
+ # ── PhishMLP: Fallback for single URL or no torch_geometric ──────────
97
+ class PhishMLP(nn.Module):
98
+ """
99
+ MLP fallback for phishing classification.
100
+ Used when torch_geometric is unavailable or graph has < 2 nodes.
101
+ Architecture: Linear(12→64) → ReLU → Dropout(0.3) → Linear(64→1) → Sigmoid
102
+ """
103
+
104
+ def __init__(self, in_channels: int = INPUT_DIM) -> None:
105
+ super().__init__()
106
+ self.net = nn.Sequential(
107
+ nn.Linear(in_channels, 64),
108
+ nn.ReLU(),
109
+ nn.Dropout(0.3),
110
+ nn.Linear(64, 1),
111
+ )
112
+
113
+ def forward(
114
+ self,
115
+ x: torch.Tensor,
116
+ edge_index: Optional[torch.Tensor] = None,
117
+ batch: Optional[torch.Tensor] = None,
118
+ ) -> torch.Tensor:
119
+ # Pool all node features to single vector via mean
120
+ if x.dim() == 2 and x.size(0) > 1:
121
+ x = x.mean(dim=0, keepdim=True)
122
+ elif x.dim() == 1:
123
+ x = x.unsqueeze(0)
124
+ out = self.net(x)
125
+ return torch.sigmoid(out)
126
+
127
+ def predict_proba(
128
+ self,
129
+ x: torch.Tensor,
130
+ edge_index: Optional[torch.Tensor] = None,
131
+ batch: Optional[torch.Tensor] = None,
132
+ ) -> float:
133
+ """Return P_gnn ∈ [0,1] β€” probability of phishing."""
134
+ self.eval()
135
+ with torch.no_grad():
136
+ output = self.forward(x, edge_index, batch)
137
+ return output.squeeze().item()
138
+
139
+
140
+ # ── Model loading utility ────────────────────────────────────────────
141
+ def load_gnn_model(model_path: Optional[str] = None) -> Optional[nn.Module]:
142
+ """
143
+ Load GNN or MLP model with optional trained weights.
144
+ Returns model in eval mode, or None if creation fails.
145
+ """
146
+ model: Optional[nn.Module] = None
147
+
148
+ try:
149
+ model = PhishGNN() if PYGEOM_AVAILABLE else PhishMLP()
150
+ except Exception as e:
151
+ logger.error(f"GNN model creation failed: {e}")
152
+ try:
153
+ model = PhishMLP()
154
+ except Exception as e2:
155
+ logger.error(f"MLP fallback creation also failed: {e2}")
156
+ return None
157
+
158
+ if model_path and os.path.exists(model_path):
159
+ try:
160
+ state = torch.load(model_path, map_location="cpu", weights_only=True)
161
+ model.load_state_dict(state)
162
+ logger.info(f"GNN weights loaded from {model_path}")
163
+ except RuntimeError as e:
164
+ logger.warning(f"GNN weights mismatch (architecture changed?): {e}")
165
+ except Exception as e:
166
+ logger.warning(f"GNN weight load failed: {e}")
167
+ elif model_path:
168
+ logger.info(f"GNN weights file not found: {model_path}")
169
+ else:
170
+ logger.info("No GNN weights path β€” using untrained model")
171
+
172
+ try:
173
+ model.eval()
174
+ except Exception as e:
175
+ logger.error(f"GNN eval() failed: {e}")
176
+ return None
177
+
178
+ return model
179
+
180
+
181
+ # Legacy alias
182
+ def load_model(model_path: Optional[str] = None) -> Optional[nn.Module]:
183
+ return load_gnn_model(model_path)
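A shape sanity check for the MLP fallback (a sketch; the features are random, so the output only matters as a range check):

    import torch
    from gnn_model import PhishMLP, INPUT_DIM

    model = PhishMLP()
    x = torch.randn(5, INPUT_DIM)    # 5 nodes, 12-dim features each
    p = model.predict_proba(x)       # mean-pools the nodes, returns a float
    assert 0.0 <= p <= 1.0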
keep_alive.py ADDED
@@ -0,0 +1,50 @@
1
+ # ============================================================
2
+ # PhishGuard AI - keep_alive.py
3
+ # Keeps your Render.com server awake 24/7.
4
+ #
5
+ # From architecture doc 5.3:
6
+ # Render free tier sleeps after 15 min of inactivity.
7
+ # This pings GET /health every 14 min to prevent that.
8
+ #
9
+ # WHERE TO RUN:
10
+ # - On a second laptop / old computer
11
+ # - On your phone using Termux (free Android app)
12
+ # - On a friend's computer
13
+ # - On your own laptop in a separate terminal (less ideal)
14
+ #
15
+ # HOW TO RUN:
16
+ # python keep_alive.py
17
+ # (keep this terminal window open β€” never close it)
18
+ # ============================================================
19
+
20
+ import time
21
+ import requests
22
+ import datetime
23
+
24
+ # !! CHANGE THIS to your actual Render URL !!
25
+ API_URL = "https://YOUR-APP-NAME.onrender.com/health"
26
+
27
+ INTERVAL = 14 * 60 # 14 minutes in seconds
28
+
29
+ print("=" * 50)
30
+ print("PhishGuard Keep-Alive Script")
31
+ print("=" * 50)
32
+ print(f"Pinging: {API_URL}")
33
+ print(f"Every: 14 minutes")
34
+ print(f"Started: {datetime.datetime.now():%Y-%m-%d %H:%M:%S}")
35
+ print("\nDO NOT close this window!")
36
+ print("Press Ctrl+C to stop.\n")
37
+
38
+ ping_count = 0
39
+ while True:
40
+ try:
41
+ r = requests.get(API_URL, timeout=15)
42
+ ping_count += 1
43
+ status = "OK" if r.status_code == 200 else f"ERROR {r.status_code}"
44
+ print(f"[{datetime.datetime.now():%H:%M:%S}] Ping #{ping_count} β†’ {status}")
45
+ except requests.exceptions.ConnectionError:
46
+ print(f"[{datetime.datetime.now():%H:%M:%S}] Connection failed β€” server might be waking up...")
47
+ except Exception as e:
48
+ print(f"[{datetime.datetime.now():%H:%M:%S}] Error: {e}")
49
+
50
+ time.sleep(INTERVAL)
main.py ADDED
@@ -0,0 +1,699 @@
1
+ # ============================================================
2
+ # PhishGuard AI - main.py
3
+ # FastAPI orchestrator - Full 4-tier phishing detection pipeline
4
+ # with feedback-driven incremental retraining.
5
+ #
6
+ # Endpoints:
7
+ # POST /analyze → 4-tier URL phishing analysis
8
+ # POST /analyze/email → BERT-only email body analysis
9
+ # POST /retrain → Incremental model retraining
10
+ # GET /model_version → Current model version info
11
+ # GET /health → All model load statuses
12
+ #
13
+ # Architecture:
14
+ # Tier 1: Whitelist O(1) → SAFE exit (~55% traffic)
15
+ # Tier 2: Heuristic 15 signals → BLOCK if >= 80 (~15% blocked)
16
+ # Tier 3: BERT+GNN parallel → BLOCK/SAFE/escalate (~15% exits)
17
+ # Tier 4: CNN visual + brand hash → BLOCK/SAFE (~15% borderline)
18
+ # ============================================================
19
+
20
+ from __future__ import annotations
21
+
22
+ import os
23
+ import sys
24
+ import asyncio
25
+ import time
26
+ import hashlib
27
+ import logging
28
+ import logging.handlers
29
+ from collections import OrderedDict
30
+ from contextlib import asynccontextmanager
31
+ from pathlib import Path
32
+ from typing import List, Optional
33
+
34
+ from fastapi import FastAPI
35
+ from fastapi.middleware.cors import CORSMiddleware
36
+ from pydantic import BaseModel
37
+
38
+ # ── Path setup ────────────────────────────────────────────────────────
39
+ BASE_DIR = Path(__file__).parent
40
+ for sub_dir in ["gnn", "cnn"]:
41
+ sub_path = BASE_DIR / sub_dir
42
+ if sub_path.is_dir():
43
+ sys.path.insert(0, str(sub_path))
44
+
45
+ # ── Logging ───────────────────────────────────────────────────────────
46
+ log_dir = BASE_DIR / "logs"
47
+ log_dir.mkdir(exist_ok=True)
48
+
49
+ _handler = logging.handlers.RotatingFileHandler(
50
+ log_dir / "phishguard.log",
51
+ maxBytes=5 * 1024 * 1024,
52
+ backupCount=3,
53
+ encoding="utf-8",
54
+ )
55
+ _handler.setFormatter(logging.Formatter(
56
+ "%(asctime)s | %(levelname)-7s | %(name)s | %(message)s",
57
+ datefmt="%Y-%m-%d %H:%M:%S",
58
+ ))
59
+
60
+ logger = logging.getLogger("phishguard")
61
+ logger.setLevel(logging.INFO)
62
+ logger.addHandler(_handler)
63
+ logger.addHandler(logging.StreamHandler())
64
+
65
+
66
+ # ── Import project modules ───────────────────────────────────────────
67
+ from url_heuristics import HeuristicScorer, HeuristicResult
68
+ from bert_analyzer import BERTPhishingClassifier
69
+
70
+ # GNN imports
71
+ GNN_AVAILABLE = False
72
+ gnn_inference = None
73
+ try:
74
+ from gnn.gnn_inference import GNNInference
75
+ GNN_AVAILABLE = True
76
+ except ImportError:
77
+ try:
78
+ from gnn_inference import GNNInference
79
+ GNN_AVAILABLE = True
80
+ except ImportError:
81
+ logger.warning("GNN module not available")
82
+
83
+ # CNN imports
84
+ CNN_AVAILABLE = False
85
+ cnn_inference = None
86
+ brand_detector = None
87
+ try:
88
+ from cnn.cnn_inference import CNNInference
89
+ from cnn.screenshot_hasher import BrandHashDetector
90
+ from cnn.cnn_model import preprocess_screenshot
91
+ CNN_AVAILABLE = True
92
+ except ImportError:
93
+ try:
94
+ from cnn_inference import CNNInference
95
+ from screenshot_hasher import BrandHashDetector
96
+ from cnn_model import preprocess_screenshot
97
+ CNN_AVAILABLE = True
98
+ except ImportError:
99
+ logger.warning("CNN module not available")
100
+
101
+ from tier3_bert_gnn import Tier3Ensemble
102
+ from retraining_service import RetrainingService, FeedbackRecord, RetrainResult
103
+
104
+
105
+ # ── Whitelist (Tier 1) ────────────────────────────────────────────────
106
+ WHITELIST: set[str] = {
107
+ "google.com", "youtube.com", "facebook.com", "amazon.com", "wikipedia.org",
108
+ "twitter.com", "instagram.com", "linkedin.com", "microsoft.com", "apple.com",
109
+ "github.com", "stackoverflow.com", "reddit.com", "netflix.com", "paypal.com",
110
+ "bankofamerica.com", "chase.com", "wellsfargo.com", "yahoo.com", "bing.com",
111
+ "outlook.com", "office.com", "live.com", "adobe.com", "dropbox.com",
112
+ "zoom.us", "slack.com", "spotify.com", "twitch.tv", "ebay.com",
113
+ "walmart.com", "target.com", "bestbuy.com", "airbnb.com",
114
+ "x.com", "tiktok.com", "pinterest.com", "quora.com", "medium.com",
115
+ }
116
+
117
+
118
+ def get_root_domain(url: str) -> str:
119
+ """Extract root domain from a URL."""
120
+ from urllib.parse import urlparse
121
+ try:
122
+ host = urlparse(url).hostname or ""
123
+ host = host[4:] if host.startswith("www.") else host  # strip only a leading "www."
124
+ parts = host.split(".")
125
+ return ".".join(parts[-2:]) if len(parts) >= 2 else host
126
+ except Exception:
127
+ return ""
128
+
129
+
130
+ # ── URL Cache (LRU, 30-min TTL) ──────────────────────────────────────
131
+ CACHE_TTL = 30 * 60
132
+ CACHE_MAX = 500
133
+
134
+
135
+ class URLCache:
136
+ def __init__(self, maxsize: int = CACHE_MAX, ttl: int = CACHE_TTL) -> None:
137
+ self._cache: OrderedDict = OrderedDict()
138
+ self._maxsize = maxsize
139
+ self._ttl = ttl
140
+
141
+ def get(self, url: str) -> Optional[dict]:
142
+ if url in self._cache:
143
+ entry = self._cache[url]
144
+ if time.time() - entry["ts"] < self._ttl:
145
+ self._cache.move_to_end(url)
146
+ return entry["result"]
147
+ else:
148
+ del self._cache[url]
149
+ return None
150
+
151
+ def set(self, url: str, result: dict) -> None:
152
+ self._cache[url] = {"result": result, "ts": time.time()}
153
+ self._cache.move_to_end(url)
154
+ if len(self._cache) > self._maxsize:
155
+ self._cache.popitem(last=False)
156
+
157
+ def clear(self) -> None:
158
+ self._cache.clear()
159
+
160
+
161
+ _url_cache = URLCache()
162
+
163
+
164
+ # ── Request/Response Models ───────────────────────────────────────────
165
+ class AnalyzeRequest(BaseModel):
166
+ url: str
167
+ heuristic_score: float = 0.0
168
+ page_title: str = ""
169
+ page_snippet: str = ""
170
+ related_urls: list = []
171
+
172
+
173
+ class EmailRequest(BaseModel):
174
+ sender: str
175
+ subject: str = ""
176
+ body: str = ""
177
+ urls: list = []
178
+ timestamp: str = ""
179
+
180
+
181
+ class FeedbackSample(BaseModel):
182
+ url: str
183
+ verdict: str = ""
184
+ confidence: float = 0.0
185
+ tier_used: int = 0
186
+ heuristic_score: int = 0
187
+ signals: list = []
188
+ user_feedback: Optional[str] = None
189
+ timestamp: str = ""
190
+ feedback_ts: Optional[str] = None
191
+ url_hash: str = ""
192
+ session_id: str = ""
193
+
194
+
195
+ class RetrainRequest(BaseModel):
196
+ samples: List[FeedbackSample]
197
+ trigger: str = "count"
198
+ session_id: str = ""
199
+ extension_version: str = ""
200
+
201
+
202
+ # ── Global state ──────────────────────────────────────────────────────
203
+ _scorer: Optional[HeuristicScorer] = None
204
+ _bert: Optional[BERTPhishingClassifier] = None
205
+ _gnn: Optional[GNNInference] = None
206
+ _cnn: Optional[CNNInference] = None
207
+ _brand: Optional[BrandHashDetector] = None
208
+ _tier3: Optional[Tier3Ensemble] = None
209
+ _retrain_service: Optional[RetrainingService] = None
210
+ _retrain_lock = asyncio.Lock()
211
+
212
+
213
+ # ── Lifespan (startup/shutdown) ───────────────────────────────────────
214
+ @asynccontextmanager
215
+ async def lifespan(app: FastAPI):
216
+ """Load all models at startup, clean up at shutdown."""
217
+ global _scorer, _bert, _gnn, _cnn, _brand, _tier3, _retrain_service
218
+
219
+ logger.info("=== PhishGuard AI starting up ===")
220
+
221
+ # Tier 2: Heuristic Scorer
222
+ _scorer = HeuristicScorer()
223
+ logger.info("βœ“ Tier 2: HeuristicScorer initialized")
224
+
225
+ # Tier 3a: BERT
226
+ _bert = BERTPhishingClassifier()
227
+ logger.info("βœ“ Tier 3a: BERT classifier initialized (lazy-load)")
228
+
229
+ # Tier 3b: GNN
230
+ if GNN_AVAILABLE:
231
+ _gnn = GNNInference()
232
+ _gnn.load()
233
+ logger.info(f"βœ“ Tier 3b: GNN loaded={_gnn.is_loaded}")
234
+ else:
235
+ _gnn = None
236
+ logger.warning("βœ— Tier 3b: GNN not available")
237
+
238
+ # Tier 3 Ensemble
239
+ if _gnn:
240
+ _tier3 = Tier3Ensemble(_bert, _gnn)
241
+ logger.info("βœ“ Tier 3: Ensemble initialized")
242
+ else:
243
+ _tier3 = None
244
+ logger.warning("βœ— Tier 3: Ensemble not available (GNN missing)")
245
+
246
+ # Tier 4: CNN + Brand Detection
247
+ if CNN_AVAILABLE:
248
+ _cnn = CNNInference()
249
+ _cnn.load()
250
+ _brand = BrandHashDetector()
251
+ logger.info(f"βœ“ Tier 4: CNN loaded={_cnn.is_loaded}, Brand hash DB loaded")
252
+ else:
253
+ _cnn = None
254
+ _brand = None
255
+ logger.warning("βœ— Tier 4: CNN not available")
256
+
257
+ # Retraining Service
258
+ _retrain_service = RetrainingService(
259
+ bert_classifier=_bert,
260
+ gnn_inference=_gnn or GNNInference(),
261
+ cnn_inference=_cnn or (CNNInference() if CNN_AVAILABLE else None),
262
+ )
263
+ logger.info("βœ“ Retraining service initialized")
264
+ logger.info("=== PhishGuard AI ready ===")
265
+
266
+ yield
267
+
268
+ logger.info("=== PhishGuard AI shutting down ===")
269
+
270
+
271
+ # ── FastAPI App ───────────────────────────────────────────────────────
272
+ app = FastAPI(
273
+ title="PhishGuard AI Backend",
274
+ version="3.0",
275
+ description="4-tier ML phishing detection with feedback-driven retraining",
276
+ lifespan=lifespan,
277
+ )
278
+
279
+ app.add_middleware(
280
+ CORSMiddleware,
281
+ allow_origins=["*"],
282
+ allow_methods=["*"],
283
+ allow_headers=["*"],
284
+ )
285
+
286
+
287
+ # ── POST /analyze - Full 4-tier pipeline ──────────────────────────────
288
+ @app.post("/analyze")
289
+ async def analyze_endpoint(req: AnalyzeRequest) -> dict:
290
+ """
291
+ Analyze a URL through the 4-tier phishing detection pipeline.
292
+
293
+ Tier 1: Whitelist → SAFE
294
+ Tier 2: Heuristic → BLOCK if >= 80
295
+ Tier 3: BERT+GNN ensemble → BLOCK/SAFE/escalate
296
+ Tier 4: CNN visual + brand hash → BLOCK/SAFE
297
+ """
298
+ url = req.url
299
+ details: dict = {}
300
+
301
+ # ── TIER 1: Whitelist ────────────────────────────────────────
302
+ root = get_root_domain(url)
303
+ if root in WHITELIST:
304
+ return {
305
+ "url": url,
306
+ "is_phishing": False,
307
+ "confidence": 0.0,
308
+ "method": "whitelist",
309
+ "status": "safe",
310
+ "tier": 1,
311
+ "heuristic_score": 0,
312
+ "signals": [],
313
+ "details": {"whitelisted_domain": root},
314
+ }
315
+
316
+ # ── Cache check ──────────────────────────────────────────────
317
+ cached = _url_cache.get(url)
318
+ if cached is not None:
319
+ return cached
320
+
321
+ # ── TIER 2: Heuristic scoring ────────────────────────────────
322
+ h_result: HeuristicResult = _scorer.score(url)
323
+
324
+ # Use the higher of server-side and browser-side heuristic scores
325
+ h_score = max(h_result.score, int(req.heuristic_score))
326
+ details["heuristic"] = {
327
+ "score": h_result.score,
328
+ "raw_score": h_result.raw_score,
329
+ "signals": h_result.signals,
330
+ "browser_score": int(req.heuristic_score),
331
+ "combined_score": h_score,
332
+ }
333
+
334
+ if h_score >= 80:
335
+ result = {
336
+ "url": url,
337
+ "is_phishing": True,
338
+ "confidence": h_score / 100.0,
339
+ "method": "heuristic",
340
+ "status": "blocked",
341
+ "tier": 2,
342
+ "heuristic_score": h_score,
343
+ "signals": h_result.signals,
344
+ "details": details,
345
+ }
346
+ _url_cache.set(url, result)
347
+ logger.info(f"Tier 2 BLOCK | url={url[:60]} | score={h_score}")
348
+ return result
349
+
350
+ # ── TIER 3: BERT + GNN Ensemble ──────────────────────────────
351
+ if _tier3 is not None:
352
+ try:
353
+ p3 = await _tier3.predict(
354
+ url=url,
355
+ title=req.page_title,
356
+ snippet=req.page_snippet,
357
+ h_score=h_score,
358
+ )
359
+ details["tier3_score"] = p3
360
+ except Exception as e:
361
+ logger.error(f"Tier 3 error: {e}")
362
+ p3 = h_score / 100.0 # fallback to heuristic
363
+ details["tier3_error"] = str(e)
364
+ else:
365
+ # Tier 3 unavailable - use BERT alone + heuristic
366
+ if _bert is not None:
367
+ loop = asyncio.get_running_loop()
368
+ try:
369
+ p_bert = await loop.run_in_executor(
370
+ None, _bert.predict, url, req.page_title, req.page_snippet,
371
+ )
372
+ except Exception:
373
+ p_bert = 0.5
374
+ h_norm = h_score / 100.0
375
+ p3 = 0.60 * p_bert + 0.40 * h_norm
376
+ else:
377
+ p3 = h_score / 100.0
378
+ details["tier3_score"] = p3
379
+ details["tier3_note"] = "ensemble_unavailable"
380
+
381
+ # Tier 3 decision
382
+ decision = Tier3Ensemble.decide(p3)
383
+
384
+ if decision == "block":
385
+ result = {
386
+ "url": url,
387
+ "is_phishing": True,
388
+ "confidence": round(p3, 4),
389
+ "method": "bert_gnn_ensemble",
390
+ "status": "blocked",
391
+ "tier": 3,
392
+ "heuristic_score": h_score,
393
+ "signals": h_result.signals,
394
+ "details": details,
395
+ }
396
+ _url_cache.set(url, result)
397
+ logger.info(f"Tier 3 BLOCK | url={url[:60]} | P3={p3:.4f}")
398
+ return result
399
+
400
+ if decision == "safe":
401
+ result = {
402
+ "url": url,
403
+ "is_phishing": False,
404
+ "confidence": round(p3, 4),
405
+ "method": "bert_gnn_ensemble",
406
+ "status": "safe",
407
+ "tier": 3,
408
+ "heuristic_score": h_score,
409
+ "signals": h_result.signals,
410
+ "details": details,
411
+ }
412
+ _url_cache.set(url, result)
413
+ logger.info(f"Tier 3 SAFE | url={url[:60]} | P3={p3:.4f}")
414
+ return result
415
+
416
+ # ── TIER 4: CNN Visual + Brand Hash (borderline 0.40 <= P3 < 0.85)
417
+ if _cnn is not None and _cnn.is_loaded:
418
+ try:
419
+ # Capture screenshot
420
+ screenshot_bytes = await _capture_screenshot_for_tier4(url)
421
+
422
+ if screenshot_bytes:
423
+ # CNN prediction
424
+ p_cnn = _cnn.predict(screenshot_bytes)
425
+ details["cnn_prob"] = round(p_cnn, 4)
426
+
427
+ # Brand hash check
428
+ brand_boost = 0.0
429
+ if _brand is not None:
430
+ is_impersonation, brand_name, brand_conf = _brand.detect(
431
+ screenshot_bytes, url
432
+ )
433
+ details["brand"] = {
434
+ "impersonation_detected": is_impersonation,
435
+ "brand": brand_name,
436
+ "confidence": round(brand_conf, 3),
437
+ }
438
+ if is_impersonation:
439
+ brand_boost = 0.25
440
+
441
+ # P_final = 0.55·P3 + 0.30·P_cnn + brand_boost
442
+ p_final = min((0.55 * p3) + (0.30 * p_cnn) + brand_boost, 1.0)
443
+ details["tier4_score"] = round(p_final, 4)
444
+
445
+ is_phishing = p_final >= 0.65
446
+ result = {
447
+ "url": url,
448
+ "is_phishing": is_phishing,
449
+ "confidence": round(p_final, 4),
450
+ "method": "full_ensemble_bert_gnn_cnn",
451
+ "status": "blocked" if is_phishing else "safe",
452
+ "tier": 4,
453
+ "heuristic_score": h_score,
454
+ "signals": h_result.signals,
455
+ "details": details,
456
+ }
457
+ _url_cache.set(url, result)
458
+ logger.info(f"Tier 4 {'BLOCK' if is_phishing else 'SAFE'} | url={url[:60]} | P_final={p_final:.4f}")
459
+ return result
460
+
461
+ except Exception as e:
462
+ logger.error(f"Tier 4 error: {e}")
463
+ details["tier4_error"] = str(e)
464
+
465
+ # Tier 4 unavailable/failed - use Tier 3 score with conservative threshold
466
+ is_phishing = p3 >= 0.65
467
+ result = {
468
+ "url": url,
469
+ "is_phishing": is_phishing,
470
+ "confidence": round(p3, 4),
471
+ "method": "bert_gnn_ensemble",
472
+ "status": "blocked" if is_phishing else "safe",
473
+ "tier": 3,
474
+ "heuristic_score": h_score,
475
+ "signals": h_result.signals,
476
+ "details": details,
477
+ }
478
+ _url_cache.set(url, result)
479
+ logger.info(f"Tier 4 fallback β†’ Tier 3 | url={url[:60]} | P3={p3:.4f}")
480
+ return result
481
+
482
+
483
+ async def _capture_screenshot_for_tier4(url: str) -> Optional[bytes]:
484
+ """Capture screenshot for Tier 4 CNN analysis."""
485
+ try:
486
+ from playwright.async_api import async_playwright
487
+
488
+ async with async_playwright() as p:
489
+ browser = await p.chromium.launch(headless=True)
490
+ page = await browser.new_page(
491
+ viewport={"width": 1280, "height": 800},
492
+ user_agent=(
493
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
494
+ "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
495
+ ),
496
+ )
497
+
498
+ # Block heavy resources
499
+ await page.route(
500
+ "**/*.{woff,woff2,ttf,eot,mp4,webm,ogg,wav,mp3}",
501
+ lambda route: route.abort(),
502
+ )
503
+
504
+ await page.goto(url, wait_until="domcontentloaded", timeout=10000)
505
+ screenshot = await page.screenshot(type="png")
506
+ await browser.close()
507
+ return screenshot
508
+
509
+ except Exception as e:
510
+ logger.warning(f"Tier 4 screenshot failed: {e}")
511
+ return None
512
+
513
+
514
+ # ── POST /analyze/email ───────────────────────────────────────────────
515
+ @app.post("/analyze/email")
516
+ async def analyze_email_endpoint(req: EmailRequest) -> dict:
517
+ """BERT-only path for email body text analysis."""
518
+ # Sender whitelist check
519
+ sender_domain = req.sender.split("@")[-1].lower() if "@" in req.sender else ""
520
+ if sender_domain in WHITELIST:
521
+ return {
522
+ "status": "safe",
523
+ "analysis": {
524
+ "isPhishing": False,
525
+ "probability": 0.0,
526
+ "reason": "Trusted sender domain",
527
+ },
528
+ }
529
+
530
+ # Analyze embedded URLs
531
+ MAX_URLS = 3
532
+ urls_to_check = req.urls[:MAX_URLS]
533
+
534
+ if not urls_to_check:
535
+ # Text-only analysis
536
+ if _bert:
537
+ combined = f"{req.subject} {req.body}"
538
+ prob = _bert.predict(combined, req.subject, req.body)
539
+ is_phishing = prob > 0.6
540
+ return {
541
+ "status": "blocked" if is_phishing else "safe",
542
+ "analysis": {
543
+ "isPhishing": is_phishing,
544
+ "probability": prob,
545
+ "reason": "BERT text analysis (no URLs)",
546
+ },
547
+ }
548
+ return {
549
+ "status": "safe",
550
+ "analysis": {
551
+ "isPhishing": False,
552
+ "probability": 0.1,
553
+ "reason": "No URLs and no ML model available",
554
+ },
555
+ }
556
+
557
+ # Analyze URLs through the main pipeline
558
+ tasks = [
559
+ analyze_endpoint(AnalyzeRequest(url=u, page_title=req.subject))
560
+ for u in urls_to_check
561
+ ]
562
+ results = await asyncio.gather(*tasks, return_exceptions=True)
563
+
564
+ max_prob = 0.0
565
+ phishing_detected = False
566
+ flagged_urls = []
567
+
568
+ for idx, r in enumerate(results):
569
+ if isinstance(r, Exception):
570
+ continue
571
+ prob = r.get("confidence", 0.0)
572
+ max_prob = max(max_prob, prob)
573
+ if r.get("is_phishing"):
574
+ phishing_detected = True
575
+ flagged_urls.append(r.get("url", urls_to_check[idx]))
576
+
577
+ return {
578
+ "status": "blocked" if phishing_detected else "safe",
579
+ "analysis": {
580
+ "isPhishing": phishing_detected,
581
+ "probability": max_prob,
582
+ "flagged_urls": flagged_urls,
583
+ "reason": "URL analysis via ML ensemble",
584
+ },
585
+ }
586
+
587
+
588
+ # ── POST /retrain - Incremental retraining ────────────────────────────
589
+ @app.post("/retrain")
590
+ async def retrain_endpoint(req: RetrainRequest) -> dict:
591
+ """
592
+ Receive labeled feedback and incrementally update all models.
593
+ Uses asyncio.Lock() to prevent concurrent retraining jobs.
594
+ Timeout: 600s max.
595
+ """
596
+ if _retrain_service is None:
597
+ return {"status": "error", "message": "Retraining service not initialized"}
598
+
599
+ # Prevent concurrent retraining
600
+ if _retrain_lock.locked():
601
+ return {
602
+ "status": "skipped",
603
+ "message": "Retraining already in progress",
604
+ "models_updated": [],
605
+ }
606
+
607
+ async with _retrain_lock:
608
+ # Convert Pydantic models to FeedbackRecord dataclasses
609
+ records = [
610
+ FeedbackRecord(
611
+ url=s.url,
612
+ verdict=s.verdict,
613
+ confidence=s.confidence,
614
+ tier_used=s.tier_used,
615
+ heuristic_score=s.heuristic_score,
616
+ signals=s.signals,
617
+ user_feedback=s.user_feedback,
618
+ timestamp=s.timestamp,
619
+ feedback_ts=s.feedback_ts,
620
+ url_hash=s.url_hash,
621
+ session_id=s.session_id,
622
+ )
623
+ for s in req.samples
624
+ ]
625
+
626
+ try:
627
+ result = await asyncio.wait_for(
628
+ _retrain_service.retrain(records),
629
+ timeout=600,
630
+ )
631
+
632
+ # Clear URL cache after retraining (stale results)
633
+ if result.status == "success":
634
+ _url_cache.clear()
635
+
636
+ return {
637
+ "status": result.status,
638
+ "models_updated": result.models_updated,
639
+ "samples_used": result.samples_used,
640
+ "duration_seconds": result.duration_seconds,
641
+ "accuracy_delta": result.accuracy_delta,
642
+ "next_retrain_hint": result.next_retrain_hint,
643
+ }
644
+
645
+ except asyncio.TimeoutError:
646
+ return {
647
+ "status": "error",
648
+ "message": "Retraining timed out (600s limit)",
649
+ }
650
+ except Exception as e:
651
+ logger.error(f"Retrain endpoint error: {e}")
652
+ return {
653
+ "status": "error",
654
+ "message": str(e),
655
+ }
656
+
657
+
658
+ # ── GET /model_version ────────────────────────────────────────────────
659
+ @app.get("/model_version")
660
+ async def model_version_endpoint() -> dict:
661
+ """Return current model version info for extension polling."""
662
+ if _retrain_service:
663
+ return _retrain_service.get_version_info()
664
+ return {"version": 0, "updated_at": None, "accuracy": {}}
665
+
666
+
667
+ # ── GET /health ───────────────────────────────────────────────────────
668
+ @app.get("/health")
669
+ async def health_endpoint() -> dict:
670
+ """Liveness probe with per-tier readiness and model statuses."""
671
+ return {
672
+ "status": "ok",
673
+ "version": "3.0",
674
+ "tier1": True,
675
+ "tier2": _scorer is not None,
676
+ "tier3": _tier3 is not None,
677
+ "tier4": _cnn is not None and _cnn.is_loaded if _cnn else False,
678
+ "retraining_in_progress": _retrain_lock.locked(),
679
+ "model_version": _retrain_service.model_version if _retrain_service else 0,
680
+ "modules": {
681
+ "heuristic": _scorer is not None,
682
+ "bert": _bert is not None and _bert.is_loaded,
683
+ "bert_lazy": _bert is not None and not _bert.is_loaded,
684
+ "gnn": _gnn is not None and _gnn.is_loaded if _gnn else False,
685
+ "cnn": _cnn is not None and _cnn.is_loaded if _cnn else False,
686
+ "brand_hash": _brand is not None,
687
+ },
688
+ }
689
+
690
+
691
+ # ── Legacy feedback endpoint (backward compat) ───────────────────────
692
+ @app.post("/feedback")
693
+ async def legacy_feedback_endpoint(req: dict) -> dict:
694
+ """Legacy feedback endpoint for backward compatibility."""
695
+ return {"status": "success", "message": "Use POST /retrain for feedback-driven retraining"}
696
+
697
+
698
+ # ── Run directly ──────────────────────────────────────────────────────
699
+ # uvicorn main:app --reload --port 8000
manifest.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "manifest_version": 3,
3
+ "name": "PhishGuard AI",
4
+ "version": "3.0",
5
+ "description": "Adaptive ML-based phishing detection with feedback-driven retraining β€” BERT + GNN + CNN ensemble",
6
+ "permissions": [
7
+ "tabs",
8
+ "storage",
9
+ "webNavigation",
10
+ "alarms",
11
+ "notifications",
12
+ "activeTab",
13
+ "scripting"
14
+ ],
15
+ "host_permissions": [
16
+ "<all_urls>"
17
+ ],
18
+ "background": {
19
+ "service_worker": "background.js"
20
+ },
21
+ "content_scripts": [
22
+ {
23
+ "matches": ["<all_urls>"],
24
+ "js": ["content.js"],
25
+ "run_at": "document_idle"
26
+ }
27
+ ],
28
+ "action": {
29
+ "default_popup": "popup.html",
30
+ "default_title": "PhishGuard AI"
31
+ },
32
+ "icons": {
33
+ "16": "icons/icon16.png",
34
+ "48": "icons/icon48.png",
35
+ "128": "icons/icon128.png"
36
+ }
37
+ }
popup.html ADDED
@@ -0,0 +1,432 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>PhishGuard AI</title>
7
+ <style>
8
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
9
+
10
+ * { margin: 0; padding: 0; box-sizing: border-box; }
11
+
12
+ :root {
13
+ --bg-primary: #0F0F14;
14
+ --bg-secondary: #1A1A24;
15
+ --bg-card: #22222E;
16
+ --bg-hover: #2A2A38;
17
+ --text-primary: #EAEAF0;
18
+ --text-secondary: #8888A0;
19
+ --text-muted: #5A5A72;
20
+ --accent: #534AB7;
21
+ --accent-glow: rgba(83, 74, 183, 0.35);
22
+ --safe: #22C55E;
23
+ --safe-glow: rgba(34, 197, 94, 0.25);
24
+ --danger: #EF4444;
25
+ --danger-glow: rgba(239, 68, 68, 0.25);
26
+ --warning: #F59E0B;
27
+ --warning-glow: rgba(245, 158, 11, 0.25);
28
+ --border: rgba(255,255,255,0.06);
29
+ --radius: 12px;
30
+ --radius-sm: 8px;
31
+ }
32
+
33
+ body {
34
+ width: 380px;
35
+ min-height: 480px;
36
+ max-height: 640px;
37
+ font-family: 'Inter', -apple-system, sans-serif;
38
+ background: var(--bg-primary);
39
+ color: var(--text-primary);
40
+ overflow-y: auto;
41
+ scrollbar-width: thin;
42
+ scrollbar-color: var(--bg-hover) transparent;
43
+ }
44
+ body::-webkit-scrollbar { width: 4px; }
45
+ body::-webkit-scrollbar-thumb { background: var(--bg-hover); border-radius: 4px; }
46
+
47
+ /* ── Header ──────────────────────────────────────────── */
48
+ .header {
49
+ display: flex; align-items: center; gap: 10px;
50
+ padding: 14px 20px 10px;
51
+ border-bottom: 1px solid var(--border);
52
+ }
53
+ .header-logo {
54
+ width: 28px; height: 28px;
55
+ background: linear-gradient(135deg, var(--accent), #7C6BDB);
56
+ border-radius: var(--radius-sm);
57
+ display: flex; align-items: center; justify-content: center;
58
+ font-size: 14px;
59
+ }
60
+ .header h1 { font-size: 15px; font-weight: 700; letter-spacing: -0.3px; }
61
+ .header h1 span { color: var(--accent); }
62
+ .header-badge {
63
+ margin-left: auto;
64
+ font-size: 10px; padding: 3px 8px;
65
+ background: var(--bg-card); border: 1px solid var(--border);
66
+ border-radius: 20px; color: var(--text-secondary); font-weight: 500;
67
+ }
68
+
69
+ /* ── URL Bar ──────────────────────────────────────────── */
70
+ .url-bar {
71
+ padding: 8px 20px;
72
+ background: var(--bg-secondary);
73
+ border-bottom: 1px solid var(--border);
74
+ }
75
+ .url-text {
76
+ font-size: 11px; color: var(--text-muted);
77
+ overflow: hidden; text-overflow: ellipsis; white-space: nowrap;
78
+ font-family: 'SF Mono', 'Fira Code', monospace;
79
+ }
80
+
81
+ /* ── Loading ──────────────────────────────────────────── */
82
+ .loading-container {
83
+ display: flex; flex-direction: column; align-items: center;
84
+ justify-content: center; padding: 40px 20px; gap: 14px;
85
+ }
86
+ .spinner {
87
+ width: 40px; height: 40px;
88
+ border: 3px solid var(--bg-hover);
89
+ border-top-color: var(--accent);
90
+ border-radius: 50%;
91
+ animation: spin 0.8s linear infinite;
92
+ }
93
+ @keyframes spin { to { transform: rotate(360deg); } }
94
+ .loading-text {
95
+ font-size: 13px; color: var(--text-secondary);
96
+ animation: pulse 1.5s ease-in-out infinite;
97
+ }
98
+ @keyframes pulse { 0%,100% { opacity: 1; } 50% { opacity: 0.5; } }
99
+
100
+ /* ── Result Panel ────────────────────────────────────── */
101
+ .result-panel { padding: 16px 20px; }
102
+
103
+ .result-hero {
104
+ display: flex; align-items: center; gap: 16px;
105
+ margin-bottom: 16px;
106
+ }
107
+
108
+ .score-ring-wrap {
109
+ position: relative; width: 80px; height: 80px; flex-shrink: 0;
110
+ }
111
+ .score-ring-bg, .score-ring-fg {
112
+ fill: none; stroke-width: 6;
113
+ }
114
+ .score-ring-bg { stroke: var(--bg-hover); }
115
+ .score-ring-fg {
116
+ stroke-linecap: round;
117
+ transform: rotate(-90deg); transform-origin: center;
118
+ transition: stroke-dashoffset 1s ease, stroke 0.5s;
119
+ stroke-dasharray: 213; stroke-dashoffset: 213;
120
+ }
121
+ .score-label {
122
+ position: absolute; inset: 0;
123
+ display: flex; flex-direction: column;
124
+ align-items: center; justify-content: center;
125
+ }
126
+ .score-pct { font-size: 20px; font-weight: 700; line-height: 1; }
127
+ .score-sub {
128
+ font-size: 9px; color: var(--text-muted);
129
+ margin-top: 2px; text-transform: uppercase; letter-spacing: 0.5px;
130
+ }
131
+
132
+ .shield-icon {
133
+ font-size: 28px;
134
+ animation: shieldPop 0.6s cubic-bezier(0.34, 1.56, 0.64, 1);
135
+ }
136
+ @keyframes shieldPop {
137
+ 0% { transform: scale(0.3) rotate(-15deg); opacity: 0; }
138
+ 60% { transform: scale(1.15) rotate(3deg); }
139
+ 100% { transform: scale(1) rotate(0); opacity: 1; }
140
+ }
141
+
142
+ .result-verdict { flex: 1; }
143
+ .verdict-label { font-size: 16px; font-weight: 700; line-height: 1.2; }
144
+ .verdict-detail { font-size: 11px; color: var(--text-secondary); margin-top: 3px; }
145
+
146
+ .status-safe { color: var(--safe); }
147
+ .status-danger { color: var(--danger); }
148
+ .status-warn { color: var(--warning); }
149
+
150
+ /* ── Tier Rows ───────────────────────────────────────── */
151
+ .tier-section { margin-top: 4px; }
152
+ .tier-row {
153
+ background: var(--bg-card);
154
+ border: 1px solid var(--border);
155
+ border-radius: var(--radius-sm);
156
+ margin-bottom: 5px; overflow: hidden;
157
+ transition: border-color 0.2s;
158
+ }
159
+ .tier-row:hover { border-color: rgba(255,255,255,0.1); }
160
+ .tier-header {
161
+ display: flex; align-items: center;
162
+ padding: 8px 12px; cursor: pointer;
163
+ user-select: none; gap: 8px;
164
+ }
165
+ .tier-dot { width: 7px; height: 7px; border-radius: 50%; flex-shrink: 0; }
166
+ .tier-name { font-size: 11px; font-weight: 600; flex: 1; }
167
+ .tier-score {
168
+ font-size: 11px; font-weight: 600;
169
+ font-family: 'SF Mono', 'Fira Code', monospace;
170
+ }
171
+ .tier-chevron {
172
+ font-size: 9px; color: var(--text-muted);
173
+ transition: transform 0.2s;
174
+ }
175
+ .tier-row.open .tier-chevron { transform: rotate(180deg); }
176
+ .tier-body {
177
+ max-height: 0; overflow: hidden;
178
+ transition: max-height 0.3s ease; padding: 0 12px;
179
+ }
180
+ .tier-row.open .tier-body { max-height: 200px; padding: 4px 12px 10px; }
181
+ .tier-detail { font-size: 10px; color: var(--text-secondary); line-height: 1.6; }
182
+ .flag-badge {
183
+ display: inline-block; padding: 1px 6px;
184
+ background: rgba(239,68,68,0.12); color: var(--danger);
185
+ border-radius: 4px; font-size: 10px; margin: 2px 2px 2px 0;
186
+ }
187
+
188
+ /* ── Feedback Section ────────────────────────────────── */
189
+ .feedback-section {
190
+ padding: 0 20px 12px; margin-top: 8px;
191
+ }
192
+ .feedback-prompt {
193
+ font-size: 12px; color: var(--text-secondary);
194
+ margin-bottom: 8px; text-align: center;
195
+ }
196
+ .feedback-buttons { display: flex; gap: 8px; }
197
+ .fb-btn {
198
+ flex: 1; padding: 8px 0;
199
+ border: 1px solid var(--border); border-radius: var(--radius-sm);
200
+ background: var(--bg-card); color: var(--text-primary);
201
+ font-size: 13px; font-weight: 600; cursor: pointer;
202
+ transition: all 0.2s; font-family: inherit;
203
+ }
204
+ .fb-btn:hover { background: var(--bg-hover); }
205
+ .fb-btn-correct:hover { border-color: var(--safe); background: rgba(34,197,94,0.08); }
206
+ .fb-btn-wrong:hover { border-color: var(--danger); background: rgba(239,68,68,0.08); }
207
+ .fb-btn.selected {
208
+ opacity: 1 !important;
209
+ }
210
+ .fb-btn.dimmed {
211
+ opacity: 0.3; pointer-events: none;
212
+ }
213
+ .fb-btn-correct.selected {
214
+ border-color: var(--safe); background: rgba(34,197,94,0.15);
215
+ color: var(--safe);
216
+ }
217
+ .fb-btn-wrong.selected {
218
+ border-color: var(--danger); background: rgba(239,68,68,0.15);
219
+ color: var(--danger);
220
+ }
221
+
222
+ .thank-you {
223
+ display: none; text-align: center; padding: 10px;
224
+ font-size: 12px; color: var(--safe);
225
+ animation: slideDown 0.3s ease;
226
+ }
227
+ .thank-you.show { display: block; }
228
+ @keyframes slideDown {
229
+ from { opacity: 0; transform: translateY(-6px); }
230
+ to { opacity: 1; transform: translateY(0); }
231
+ }
232
+
233
+ /* ── Retraining Status ───────────────────────────────── */
234
+ .retrain-section {
235
+ padding: 8px 20px 12px;
236
+ border-top: 1px solid var(--border);
237
+ margin-top: 4px;
238
+ }
239
+ .retrain-row {
240
+ display: flex; align-items: center; gap: 6px;
241
+ font-size: 11px; color: var(--text-muted);
242
+ margin-bottom: 4px;
243
+ }
244
+ .retrain-row .icon { font-size: 12px; }
245
+ .retrain-progress {
246
+ height: 3px; background: var(--bg-hover);
247
+ border-radius: 2px; margin: 6px 0 4px;
248
+ overflow: hidden;
249
+ }
250
+ .retrain-progress-bar {
251
+ height: 100%; background: linear-gradient(90deg, var(--accent), #7C6BDB);
252
+ border-radius: 2px; transition: width 0.5s ease;
253
+ }
254
+
255
+ /* ── Session Stats ───────────────────────────────────── */
256
+ .stats-row {
257
+ display: flex; justify-content: space-between;
258
+ padding: 6px 20px;
259
+ border-top: 1px solid var(--border);
260
+ font-size: 11px; color: var(--text-muted);
261
+ }
262
+
263
+ /* ── Blocked Overlay ─────────────────────────────────── */
264
+ .blocked-overlay {
265
+ display: none; padding: 28px 24px; text-align: center;
266
+ }
267
+ .blocked-overlay.show {
268
+ display: flex; flex-direction: column;
269
+ align-items: center; gap: 10px;
270
+ }
271
+ .blocked-shield { font-size: 48px; animation: shieldPop 0.6s ease; }
272
+ .blocked-title { font-size: 18px; font-weight: 700; color: var(--danger); }
273
+ .blocked-url {
274
+ font-size: 11px; color: var(--text-muted);
275
+ word-break: break-all; max-width: 300px;
276
+ }
277
+ .blocked-method { font-size: 12px; color: var(--text-secondary); }
278
+ .proceed-btn {
279
+ margin-top: 8px; padding: 8px 20px;
280
+ background: transparent; border: 1px solid rgba(239,68,68,0.3);
281
+ border-radius: var(--radius-sm); color: var(--text-secondary);
282
+ font-size: 12px; cursor: pointer; font-family: inherit;
283
+ transition: all 0.2s;
284
+ }
285
+ .proceed-btn:hover {
286
+ background: rgba(239,68,68,0.08);
287
+ border-color: var(--danger); color: var(--danger);
288
+ }
289
+
290
+ .offline-banner {
291
+ display: none; padding: 8px 16px;
292
+ background: rgba(245, 158, 11, 0.08);
293
+ border: 1px solid rgba(245, 158, 11, 0.2);
294
+ border-radius: var(--radius-sm);
295
+ margin: 8px 20px 0; font-size: 11px;
296
+ color: var(--warning); text-align: center;
297
+ }
298
+ .offline-banner.show { display: block; }
299
+ </style>
300
+ </head>
301
+ <body>
302
+
303
+ <!-- Header -->
304
+ <div class="header">
305
+ <div class="header-logo">πŸ›‘οΈ</div>
306
+ <h1>Phish<span>Guard</span> AI</h1>
307
+ <span class="header-badge" id="versionBadge">v3.0</span>
308
+ </div>
309
+
310
+ <!-- URL Bar -->
311
+ <div class="url-bar">
312
+ <div class="url-text" id="currentUrl">Analyzing...</div>
313
+ </div>
314
+
315
+ <!-- Server offline banner -->
316
+ <div class="offline-banner" id="offlineBanner">
317
+ ⚠️ Server offline β€” local heuristic only
318
+ </div>
319
+
320
+ <!-- Loading State -->
321
+ <div class="loading-container" id="loadingState">
322
+ <div class="spinner"></div>
323
+ <div class="loading-text">Analyzing with AI ensemble...</div>
324
+ </div>
325
+
326
+ <!-- Result Panel -->
327
+ <div class="result-panel" id="resultPanel" style="display:none;">
328
+ <div class="result-hero">
329
+ <div class="score-ring-wrap">
330
+ <svg width="80" height="80" viewBox="0 0 80 80">
331
+ <circle class="score-ring-bg" cx="40" cy="40" r="34" />
332
+ <circle class="score-ring-fg" id="scoreRing" cx="40" cy="40" r="34" />
333
+ </svg>
334
+ <div class="score-label">
335
+ <div class="score-pct" id="scorePct">0%</div>
336
+ <div class="score-sub" id="scoreSub">RISK</div>
337
+ </div>
338
+ </div>
339
+ <div style="display:flex; flex-direction:column; align-items:center; gap:4px">
340
+ <div class="shield-icon" id="shieldIcon">πŸ›‘οΈ</div>
341
+ </div>
342
+ <div class="result-verdict">
343
+ <div class="verdict-label" id="verdictLabel">Analyzing</div>
344
+ <div class="verdict-detail" id="verdictDetail">Please wait...</div>
345
+ </div>
346
+ </div>
347
+
348
+ <!-- Tier Rows -->
349
+ <div class="tier-section" id="tierSection">
350
+ <div class="tier-row" data-tier="1">
351
+ <div class="tier-header" onclick="toggleTier(this)">
352
+ <div class="tier-dot" id="t1Dot"></div>
353
+ <div class="tier-name">Tier 1 Β· Whitelist</div>
354
+ <div class="tier-score" id="t1Score">β€”</div>
355
+ <div class="tier-chevron">β–Ό</div>
356
+ </div>
357
+ <div class="tier-body"><div class="tier-detail" id="t1Detail">O(1) domain lookup</div></div>
358
+ </div>
359
+ <div class="tier-row" data-tier="2">
360
+ <div class="tier-header" onclick="toggleTier(this)">
361
+ <div class="tier-dot" id="t2Dot"></div>
362
+ <div class="tier-name">Tier 2 Β· Heuristics</div>
363
+ <div class="tier-score" id="t2Score">β€”</div>
364
+ <div class="tier-chevron">β–Ό</div>
365
+ </div>
366
+ <div class="tier-body"><div class="tier-detail" id="t2Detail">15 regex/math signals</div></div>
367
+ </div>
368
+ <div class="tier-row" data-tier="3">
369
+ <div class="tier-header" onclick="toggleTier(this)">
370
+ <div class="tier-dot" id="t3Dot"></div>
371
+ <div class="tier-name">Tier 3 Β· BERT + GNN</div>
372
+ <div class="tier-score" id="t3Score">β€”</div>
373
+ <div class="tier-chevron">β–Ό</div>
374
+ </div>
375
+ <div class="tier-body"><div class="tier-detail" id="t3Detail">Parallel NLP + graph analysis</div></div>
376
+ </div>
377
+ <div class="tier-row" data-tier="4">
378
+ <div class="tier-header" onclick="toggleTier(this)">
379
+ <div class="tier-dot" id="t4Dot"></div>
380
+ <div class="tier-name">Tier 4 Β· CNN Visual</div>
381
+ <div class="tier-score" id="t4Score">β€”</div>
382
+ <div class="tier-chevron">β–Ό</div>
383
+ </div>
384
+ <div class="tier-body"><div class="tier-detail" id="t4Detail">Screenshot + brand detection</div></div>
385
+ </div>
386
+ </div>
387
+ </div>
388
+
389
+ <!-- Feedback Section -->
390
+ <div class="feedback-section" id="feedbackSection" style="display:none;">
391
+ <div class="feedback-prompt">Was this correct?</div>
392
+ <div class="feedback-buttons">
393
+ <button class="fb-btn fb-btn-correct" id="btnCorrect">πŸ‘ Correct</button>
394
+ <button class="fb-btn fb-btn-wrong" id="btnWrong">οΏ½οΏ½οΏ½ Incorrect</button>
395
+ </div>
396
+ <div class="thank-you" id="thankYou">βœ“ Thanks! Helps us improve 🎯</div>
397
+ </div>
398
+
399
+ <!-- Retraining Status -->
400
+ <div class="retrain-section" id="retrainSection" style="display:none;">
401
+ <div class="retrain-row">
402
+ <span class="icon">πŸ”„</span>
403
+ <span id="retrainStatus">Next retrain: calculating...</span>
404
+ </div>
405
+ <div class="retrain-progress">
406
+ <div class="retrain-progress-bar" id="retrainProgressBar" style="width: 0%"></div>
407
+ </div>
408
+ <div class="retrain-row">
409
+ <span class="icon">πŸ“ˆ</span>
410
+ <span id="retrainLast">No retraining yet</span>
411
+ </div>
412
+ </div>
413
+
414
+ <!-- Session Stats -->
415
+ <div class="stats-row" id="statsRow" style="display:none;">
416
+ <span id="statScanned">πŸ“Š 0 scanned</span>
417
+ <span id="statFeedback">πŸ’¬ 0 feedback</span>
418
+ <span id="statVersion">🏷️ v0</span>
419
+ </div>
420
+
421
+ <!-- Blocked Page Overlay -->
422
+ <div class="blocked-overlay" id="blockedOverlay">
423
+ <div class="blocked-shield">🚨</div>
424
+ <div class="blocked-title">Phishing Detected!</div>
425
+ <div class="blocked-url" id="blockedUrl"></div>
426
+ <div class="blocked-method" id="blockedMethod"></div>
427
+ <button class="proceed-btn" id="proceedBtn">Proceed Anyway (Unsafe)</button>
428
+ </div>
429
+
430
+ <script src="popup.js"></script>
431
+ </body>
432
+ </html>
popup.js ADDED
@@ -0,0 +1,332 @@
1
+ // ============================================================
2
+ // PhishGuard AI - popup.js
3
+ // Popup logic: displays verdict, feedback buttons, retraining
4
+ // status, and session stats.
5
+ // ============================================================
6
+
7
+ (function() {
8
+ "use strict";
9
+
10
+ // ── DOM Elements ──────────────────────────────────────────────
11
+ const $id = id => document.getElementById(id);
12
+ const loadingState = $id("loadingState");
13
+ const resultPanel = $id("resultPanel");
14
+ const feedbackSection = $id("feedbackSection");
15
+ const retrainSection = $id("retrainSection");
16
+ const statsRow = $id("statsRow");
17
+ const blockedOverlay = $id("blockedOverlay");
18
+ const offlineBanner = $id("offlineBanner");
19
+
20
+ let currentResult = null;
21
+ let currentUrlHash = null;
22
+ let feedbackGiven = false;
23
+ let feedbackTimeout = null;
24
+ let countdownInterval = null;
25
+
26
+ // ── Init ──────────────────────────────────────────────────────
27
+ async function init() {
28
+ // Check if this is a blocked page redirect
29
+ const params = new URLSearchParams(window.location.search);
30
+ if (params.get("blocked") === "1") {
31
+ showBlockedPage(params);
32
+ return;
33
+ }
34
+
35
+ // Get active tab
36
+ const [tab] = await chrome.tabs.query({ active: true, currentWindow: true });
37
+ if (!tab?.url || !tab.url.startsWith("http")) {
38
+ showResult({
39
+ status: "safe", tier: 0, method: "internal",
40
+ confidence: 0, url: tab?.url || "N/A"
41
+ });
42
+ return;
43
+ }
44
+
45
+ $id("currentUrl").textContent = tab.url;
46
+
47
+ // Try per-tab cache first (instant)
48
+ chrome.runtime.sendMessage(
49
+ { type: "get_tab_result", tabId: tab.id },
50
+ response => {
51
+ if (response?.result) {
52
+ showResult(response.result);
53
+ } else {
54
+ // Fallback to chrome.storage
55
+ chrome.storage.local.get("lastResult", data => {
56
+ if (data.lastResult && data.lastResult.url === tab.url) {
57
+ showResult(data.lastResult);
58
+ } else {
59
+ loadingState.style.display = "flex";
60
+ }
61
+ });
62
+ }
63
+ }
64
+ );
65
+
66
+ // Load status
67
+ loadStatus();
68
+ }
69
+
70
+ // ── Show Result ───────────────────────────────────────────────
71
+ function showResult(result) {
72
+ currentResult = result;
73
+ loadingState.style.display = "none";
74
+ resultPanel.style.display = "block";
75
+ feedbackSection.style.display = "block";
76
+ retrainSection.style.display = "block";
77
+ statsRow.style.display = "flex";
78
+
79
+ const isBlocked = result.status === "blocked" || result.is_phishing;
80
+ const isWarn = !isBlocked && (result.confidence || 0) >= 0.4;
81
+ const confidence = Math.round((result.confidence || 0) * 100);
82
+ const tier = result.tier || 0;
83
+
84
+ // Score ring
85
+ const ring = $id("scoreRing");
86
+ const circumference = 2 * Math.PI * 34; // r=34
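+ // 2Ο€Β·34 β‰ˆ 213.6, which is why the CSS seeds stroke-dasharray with 213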
87
+ const offset = circumference - (confidence / 100) * circumference;
88
+ ring.style.strokeDasharray = circumference;
89
+
90
+ setTimeout(() => {
91
+ ring.style.strokeDashoffset = offset;
92
+ ring.style.stroke = isBlocked ? "var(--danger)" :
93
+ isWarn ? "var(--warning)" : "var(--safe)";
94
+ }, 100);
95
+
96
+ $id("scorePct").textContent = confidence + "%";
97
+ $id("scorePct").className = `score-pct ${isBlocked ? "status-danger" : isWarn ? "status-warn" : "status-safe"}`;
98
+ $id("scoreSub").textContent = isBlocked ? "THREAT" : "RISK";
99
+
100
+ // Shield
101
+ $id("shieldIcon").textContent = isBlocked ? "🚨" : isWarn ? "⚠️" : "βœ…";
102
+
103
+ // Verdict text
104
+ $id("verdictLabel").textContent = isBlocked ? "PHISHING DETECTED" :
105
+ isWarn ? "SUSPICIOUS" : "SAFE";
106
+ $id("verdictLabel").className = `verdict-label ${isBlocked ? "status-danger" : isWarn ? "status-warn" : "status-safe"}`;
107
+
108
+ const methodNames = {
109
+ "whitelist": "Whitelist (Tier 1)",
110
+ "heuristic": "Heuristic Engine (Tier 2)",
111
+ "heuristic-fallback": "Heuristic Fallback",
112
+ "bert_gnn_ensemble": "BERT + GNN Ensemble (Tier 3)",
113
+ "full_ensemble_bert_gnn_cnn": "Full ML Ensemble (Tier 4)",
114
+ "ensemble_with_visual": "Ensemble + Visual (Tier 4)",
115
+ "user-override": "User Override",
116
+ };
117
+ const methodText = methodNames[result.method] || result.method || "Unknown";
118
+ $id("verdictDetail").textContent = `${methodText} Β· Confidence: ${confidence}%`;
119
+
120
+ // Tier dots and scores
121
+ updateTierRow(1, tier >= 1 ? "checked" : "pending", tier === 1 ? "SAFE βœ“" : "Miss β†’");
122
+ updateTierRow(2, tier >= 2 ? (isBlocked && tier === 2 ? "blocked" : "checked") : "pending",
123
+ result.heuristic_score != null ? `${result.heuristic_score}/100` : "β€”");
124
+ updateTierRow(3, tier >= 3 ? (isBlocked && tier === 3 ? "blocked" : "checked") : "pending",
125
+ result.details?.tier3_score != null ? (result.details.tier3_score * 100).toFixed(0) + "%" : "β€”");
126
+ updateTierRow(4, tier >= 4 ? (isBlocked && tier === 4 ? "blocked" : "checked") : "pending",
127
+ result.details?.tier4_score != null ? (result.details.tier4_score * 100).toFixed(0) + "%" : "β€”");
128
+
129
+ // Tier 2 details β€” show triggered signals
130
+ if (result.signals && result.signals.length > 0) {
131
+ const badges = result.signals.map(s => `<span class="flag-badge">${s}</span>`).join(" ");
132
+ $id("t2Detail").innerHTML = `Signals triggered:<br>${badges}`;
133
+ }
134
+
135
+ // Compute URL hash for feedback
136
+ computeUrlHash(result.url);
137
+
138
+ // Check if we already gave feedback
139
+ checkExistingFeedback();
140
+ }
141
+
142
+ function updateTierRow(tier, status, scoreText) {
143
+ const dot = $id(`t${tier}Dot`);
144
+ const score = $id(`t${tier}Score`);
145
+
146
+ const colors = {
147
+ checked: "var(--safe)",
148
+ blocked: "var(--danger)",
149
+ pending: "var(--text-muted)",
150
+ };
151
+ dot.style.background = colors[status] || colors.pending;
152
+ score.textContent = scoreText;
153
+ }
154
+
155
+ // ── Blocked Page ──────────────────────────────────────────────
156
+ function showBlockedPage(params) {
157
+ loadingState.style.display = "none";
158
+ blockedOverlay.classList.add("show");
159
+
160
+ const url = decodeURIComponent(params.get("url") || "");
161
+ const score = params.get("score") || "0";
162
+ const method = decodeURIComponent(params.get("method") || "");
163
+
164
+ $id("blockedUrl").textContent = url;
165
+ $id("blockedMethod").textContent = `Detection: ${method} Β· Risk: ${score}%`;
166
+ $id("currentUrl").textContent = url;
167
+
168
+ currentResult = { url, status: "blocked", confidence: parseInt(score) / 100, method };
169
+ computeUrlHash(url);
170
+
171
+ // Show feedback for blocked pages too
172
+ feedbackSection.style.display = "block";
173
+ retrainSection.style.display = "block";
174
+ statsRow.style.display = "flex";
175
+ loadStatus();
176
+
177
+ $id("proceedBtn").onclick = () => {
178
+ chrome.runtime.sendMessage({ type: "whitelist_url", url }, () => {
179
+ chrome.tabs.update({ url });
180
+ });
181
+ };
182
+ }
183
+
184
+ // ── Feedback ──────────────────────────────────────────────────
185
+ async function computeUrlHash(url) {
186
+ if (!url) return;
187
+ const encoded = new TextEncoder().encode(url);
188
+ const hash = await crypto.subtle.digest("SHA-256", encoded);
189
+ currentUrlHash = Array.from(new Uint8Array(hash))
190
+ .map(b => b.toString(16).padStart(2, "0")).join("");
191
+ }
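+ // e.g. computeUrlHash("https://a.com") stores a 64-char lowercase hex
+ // SHA-256 digest, the same url_hash format the backend keeps in FeedbackRecord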
192
+
193
+ function submitFeedback(feedback) {
194
+ if (feedbackGiven || !currentUrlHash) return;
195
+ feedbackGiven = true;
196
+
197
+ chrome.runtime.sendMessage({
198
+ type: "submit_feedback",
199
+ url_hash: currentUrlHash,
200
+ feedback: feedback,
201
+ }, response => {
202
+ if (response?.success) {
203
+ // Highlight selected, dim other
204
+ const correctBtn = $id("btnCorrect");
205
+ const wrongBtn = $id("btnWrong");
206
+
207
+ if (feedback === "correct") {
208
+ correctBtn.classList.add("selected");
209
+ wrongBtn.classList.add("dimmed");
210
+ } else {
211
+ wrongBtn.classList.add("selected");
212
+ correctBtn.classList.add("dimmed");
213
+ }
214
+
215
+ $id("thankYou").classList.add("show");
216
+
217
+ // Allow changing within 5 minutes
218
+ feedbackTimeout = setTimeout(() => {
219
+ feedbackGiven = false;
220
+ correctBtn.classList.remove("selected", "dimmed");
221
+ wrongBtn.classList.remove("selected", "dimmed");
222
+ }, 5 * 60 * 1000);
223
+ }
224
+ });
225
+ }
226
+
227
+ function checkExistingFeedback() {
228
+ // Check if feedback was already given for this URL
229
+ chrome.storage.local.get("phishguard_feedback_queue", data => {
230
+ const queue = data.phishguard_feedback_queue || [];
231
+ const record = queue.find(r => r.url_hash === currentUrlHash);
232
+ if (record?.user_feedback) {
233
+ feedbackGiven = true;
234
+ const correctBtn = $id("btnCorrect");
235
+ const wrongBtn = $id("btnWrong");
236
+ if (record.user_feedback === "correct") {
237
+ correctBtn.classList.add("selected");
238
+ wrongBtn.classList.add("dimmed");
239
+ } else {
240
+ wrongBtn.classList.add("selected");
241
+ correctBtn.classList.add("dimmed");
242
+ }
243
+ }
244
+ });
245
+ }
246
+
247
+ // ── Retraining Status ─────────────────────────────────────────
248
+ function loadStatus() {
249
+ chrome.runtime.sendMessage({ type: "get_status" }, status => {
250
+ if (!status) return;
251
+
252
+ // Stats row
253
+ $id("statScanned").textContent = `πŸ“Š ${status.scan_count} scanned`;
254
+ $id("statFeedback").textContent = `πŸ’¬ ${status.labeled_count || 0} labeled`;
255
+ $id("statVersion").textContent = `🏷️ v${status.model_version}`;
256
+ $id("versionBadge").textContent = `v${status.model_version || "3.0"}`;
257
+
258
+ // Retrain progress
259
+ const urlsRemaining = status.next_retrain_urls_remaining || 50;
260
+ const progress = Math.round(((50 - urlsRemaining) / 50) * 100);
261
+ $id("retrainProgressBar").style.width = `${progress}%`;
262
+
263
+ // Retrain status text
264
+ const timeMs = status.next_retrain_time_remaining_ms || 0;
265
+ const hours = Math.floor(timeMs / 3600000);
266
+ const mins = Math.floor((timeMs % 3600000) / 60000);
267
+
268
+ const labeledNeeded = status.min_labeled_needed || 0;
269
+ if (labeledNeeded > 0) {
270
+ $id("retrainStatus").textContent =
271
+ `Need ${labeledNeeded} more feedback to retrain`;
272
+ } else {
273
+ $id("retrainStatus").textContent =
274
+ `Next retrain: ${urlsRemaining} URLs or ${hours}h ${mins}m`;
275
+ }
276
+
277
+ // Last retrain info
278
+ if (status.last_retrain_ts) {
279
+ const ago = timeSince(new Date(status.last_retrain_ts));
280
+ $id("retrainLast").textContent = `Last retrain: ${ago} ago`;
281
+ }
282
+
283
+ // Start countdown
284
+ startCountdown(timeMs);
285
+ });
286
+ }
287
+
288
+ function startCountdown(initialMs) {
289
+ if (countdownInterval) clearInterval(countdownInterval);
290
+ let remaining = initialMs;
291
+
292
+ countdownInterval = setInterval(() => {
293
+ remaining -= 1000;
294
+ if (remaining <= 0) {
295
+ clearInterval(countdownInterval);
296
+ $id("retrainStatus").textContent = "Retrain pending...";
297
+ return;
298
+ }
299
+ const h = Math.floor(remaining / 3600000);
300
+ const m = Math.floor((remaining % 3600000) / 60000);
301
+ const s = Math.floor((remaining % 60000) / 1000);
302
+
303
+ // Only update the time portion if it's the time display
304
+ const el = $id("retrainStatus");
305
+ if (el.textContent.includes("URLs or")) {
306
+ const parts = el.textContent.split(" or ");
307
+ el.textContent = `${parts[0]} or ${h}h ${m}m ${s}s`;
308
+ }
309
+ }, 1000);
310
+ }
311
+
312
+ function timeSince(date) {
313
+ const seconds = Math.floor((Date.now() - date.getTime()) / 1000);
314
+ if (seconds < 60) return `${seconds}s`;
315
+ if (seconds < 3600) return `${Math.floor(seconds / 60)}m`;
316
+ if (seconds < 86400) return `${Math.floor(seconds / 3600)}h`;
317
+ return `${Math.floor(seconds / 86400)}d`;
318
+ }
319
+
320
+ // ── Tier Row Toggle ───────────────────────────────────────────
321
+ // MV3 extension pages disallow inline onclick handlers, so bind the tier toggles here.
322
+ document.querySelectorAll(".tier-header").forEach(header =>
323
+ header.addEventListener("click", () =>
324
+ header.parentElement.classList.toggle("open")));
325
+
326
+ // ── Event Listeners ───────────────────────────────────────────
327
+ $id("btnCorrect").addEventListener("click", () => submitFeedback("correct"));
328
+ $id("btnWrong").addEventListener("click", () => submitFeedback("incorrect"));
329
+
330
+ // ── Start ─────────────────────────────────────────────────────
331
+ init();
332
+ })();
render.yaml ADDED
@@ -0,0 +1,34 @@
1
+ # ============================================================
2
+ # render.yaml β€” Render.com deployment config (architecture doc 5.2)
3
+ #
4
+ # PLAYWRIGHT ON RENDER:
5
+ # To enable Tier 4 visual analysis (Playwright + Chromium), you need:
6
+ #
7
+ # 1. Add ENABLE_VISUAL_TIER=1 env var below
8
+ # 2. Switch from python:3.10-slim to a full image in Dockerfile
9
+ # OR add Chromium system deps to the Dockerfile:
10
+ # apt-get install -y libnss3 libatk1.0-0 libatk-bridge2.0-0 \
11
+ # libcups2 libxkbcommon0 libgbm1 libpango-1.0-0 \
12
+ # libcairo2 libasound2 libxdamage1 libxrandr2 libxfixes3
13
+ # 3. Add to Dockerfile after pip install:
14
+ # RUN pip install playwright && playwright install chromium
15
+ #
16
+ # NOTE: Playwright + Chromium adds ~400MB to the Docker image.
17
+ # On the free tier (512MB RAM), this may cause OOM.
18
+ # Only enable if you have a paid plan with >= 1GB RAM.
19
+ # ============================================================
20
+
21
+ services:
22
+ - type: web
23
+ name: phishguard-api
24
+ runtime: docker
25
+ dockerfilePath: ./Dockerfile
26
+ plan: free
27
+ healthCheckPath: /health
28
+ autoDeploy: true
29
+ envVars:
30
+ - key: PORT
31
+ value: "8000"
32
+ # Uncomment below to enable Tier 4 visual analysis (needs Playwright in Dockerfile)
33
+ # - key: ENABLE_VISUAL_TIER
34
+ # value: "1"
requirements.txt ADDED
@@ -0,0 +1,18 @@
1
+ fastapi==0.111.0
2
+ uvicorn[standard]==0.29.0
3
+ transformers==4.40.0
4
+ torch==2.2.2
5
+ torch-geometric==2.5.2
6
+ torchvision==0.17.2
7
+ playwright==1.44.0
8
+ pillow==10.3.0
9
+ scikit-learn==1.4.2
10
+ pandas==2.2.2
11
+ numpy==1.26.4
12
+ httpx==0.27.0
13
+ imagehash==4.3.1
14
+ requests==2.31.0
15
+ aiohttp==3.9.5
16
+ aiofiles==23.2.1
17
+ python-multipart==0.0.9
18
+ apscheduler==3.10.4
retraining_service.py ADDED
@@ -0,0 +1,295 @@
1
+ # ============================================================
2
+ # PhishGuard AI - retraining_service.py
3
+ # Incremental retraining service for all 3 ML models.
4
+ #
5
+ # Receives labeled feedback samples from the Chrome extension.
6
+ # Runs parallel incremental updates for BERT, GNN, and CNN.
7
+ # Tracks model version and accuracy deltas.
8
+ # Supports hot-reload of all models without server restart.
9
+ # ============================================================
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import json
15
+ import logging
16
+ import time
17
+ from dataclasses import dataclass, field
18
+ from pathlib import Path
19
+ from typing import Dict, List, Optional, Tuple
20
+
21
+ logger = logging.getLogger("phishguard.retrain")
22
+
23
+ DATA_DIR = Path(__file__).parent / "data"
24
+ MODEL_VERSION_PATH = DATA_DIR / "model_version.json"
25
+
26
+
27
+ @dataclass
28
+ class FeedbackRecord:
29
+ """A single feedback record from the Chrome extension."""
30
+ url: str
31
+ verdict: str # "phishing" or "safe"
32
+ confidence: float = 0.0
33
+ tier_used: int = 0
34
+ heuristic_score: int = 0
35
+ signals: List[str] = field(default_factory=list)
36
+ user_feedback: Optional[str] = None # "correct" or "incorrect"
37
+ timestamp: str = ""
38
+ feedback_ts: Optional[str] = None
39
+ url_hash: str = ""
40
+ session_id: str = ""
41
+
42
+
43
+ @dataclass
44
+ class RetrainResult:
45
+ """Result from a retraining run."""
46
+ status: str # "success", "skipped", "error"
47
+ models_updated: List[str] = field(default_factory=list)
48
+ samples_used: int = 0
49
+ duration_seconds: float = 0.0
50
+ accuracy_delta: Dict[str, Optional[float]] = field(default_factory=dict)
51
+ next_retrain_hint: Dict = field(default_factory=dict)
52
+
53
+
54
+ class RetrainingService:
55
+ """
56
+ Orchestrates incremental retraining for all 3 ML models.
57
+ Called by POST /retrain endpoint.
58
+ """
59
+
60
+ def __init__(
61
+ self,
62
+ bert_classifier,
63
+ gnn_inference,
64
+ cnn_inference,
65
+ ) -> None:
66
+ self._bert = bert_classifier
67
+ self._gnn = gnn_inference
68
+ self._cnn = cnn_inference
69
+ self._model_version = self._load_version()
70
+
71
+ def _load_version(self) -> int:
72
+ """Load current model version from disk."""
73
+ MODEL_VERSION_PATH.parent.mkdir(parents=True, exist_ok=True)
74
+ if MODEL_VERSION_PATH.exists():
75
+ try:
76
+ data = json.loads(MODEL_VERSION_PATH.read_text())
77
+ return data.get("version", 0)
78
+ except Exception:
79
+ pass
80
+ return 0
81
+
82
+ def _save_version(self, accuracy_delta: Dict[str, Optional[float]]) -> None:
83
+ """Save updated model version to disk."""
84
+ self._model_version += 1
85
+ data = {
86
+ "version": self._model_version,
87
+ "updated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
88
+ "accuracy": accuracy_delta,
89
+ }
90
+ MODEL_VERSION_PATH.write_text(json.dumps(data, indent=2))
91
+
92
+ @property
93
+ def model_version(self) -> int:
94
+ return self._model_version
95
+
96
+ def get_version_info(self) -> dict:
97
+ """Get current model version info for GET /model_version."""
98
+ if MODEL_VERSION_PATH.exists():
99
+ try:
100
+ return json.loads(MODEL_VERSION_PATH.read_text())
101
+ except Exception:
102
+ pass
103
+ return {
104
+ "version": self._model_version,
105
+ "updated_at": None,
106
+ "accuracy": {},
107
+ }
108
+
109
+ async def retrain(
110
+ self,
111
+ samples: List[FeedbackRecord],
112
+ ) -> RetrainResult:
113
+ """
114
+ Perform incremental retraining on all models.
115
+
116
+ Steps:
117
+ 1. Validate samples (min 10, URL format check)
118
+ 2. Separate by tier_used for targeted updates
119
+ 3. Run BERT + GNN updates in parallel
120
+ 4. Run CNN update if Tier 4 samples exist
121
+ 5. Compute accuracy_delta for each model
122
+ 6. Increment model version
123
+ 7. Hot-reload all models
124
+
125
+ Returns RetrainResult with status and deltas.
126
+ """
127
+ start_time = time.time()
128
+
129
+ # 1. Validate
130
+ valid_samples = self._validate_samples(samples)
131
+ if len(valid_samples) < 10:
132
+ return RetrainResult(
133
+ status="skipped",
134
+ samples_used=len(valid_samples),
135
+ next_retrain_hint={
136
+ "recommended_trigger": "count",
137
+ "min_samples_needed": 10 - len(valid_samples),
138
+ },
139
+ )
140
+
141
+ # 2. Convert to (url, label) pairs
142
+ url_label_pairs: List[Tuple[str, int]] = []
143
+ tier4_pairs: List[Tuple[str, int]] = []
144
+
145
+ for sample in valid_samples:
146
+ # Determine the true label based on user feedback
147
+ if sample.user_feedback == "correct":
148
+ label = 1 if sample.verdict == "phishing" else 0
149
+ elif sample.user_feedback == "incorrect":
150
+ label = 0 if sample.verdict == "phishing" else 1
151
+ else:
152
+ continue
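+ # e.g. verdict="phishing" with feedback="incorrect" marks a false positive,
+ # so the corrected training label is 0 (safe)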
153
+
154
+ url_label_pairs.append((sample.url, label))
155
+ if sample.tier_used == 4:
156
+ tier4_pairs.append((sample.url, label))
157
+
158
+ if len(url_label_pairs) < 5:
159
+ return RetrainResult(
160
+ status="skipped",
161
+ samples_used=len(url_label_pairs),
162
+ next_retrain_hint={
163
+ "recommended_trigger": "count",
164
+ "min_samples_needed": 5,
165
+ },
166
+ )
167
+
168
+ # 3. Run updates
169
+ models_updated: List[str] = []
170
+ accuracy_delta: Dict[str, Optional[float]] = {}
171
+
172
+ try:
173
+ # BERT + GNN in parallel
174
+ loop = asyncio.get_running_loop()
175
+
176
+ bert_task = loop.run_in_executor(
177
+ None,
178
+ self._bert.incremental_update,
179
+ url_label_pairs,
180
+ )
181
+
182
+ gnn_task = loop.run_in_executor(
183
+ None,
184
+ self._gnn.incremental_update,
185
+ url_label_pairs,
186
+ )
187
+
188
+ bert_delta, gnn_delta = await asyncio.gather(
189
+ bert_task, gnn_task,
190
+ return_exceptions=True,
191
+ )
192
+
193
+ # Process BERT result
194
+ if isinstance(bert_delta, Exception):
195
+ logger.error(f"BERT update error: {bert_delta}")
196
+ accuracy_delta["bert"] = None
197
+ elif bert_delta is not None:
198
+ accuracy_delta["bert"] = bert_delta
199
+ models_updated.append("bert")
200
+ else:
201
+ accuracy_delta["bert"] = None
202
+
203
+ # Process GNN result
204
+ if isinstance(gnn_delta, Exception):
205
+ logger.error(f"GNN update error: {gnn_delta}")
206
+ accuracy_delta["gnn"] = None
207
+ elif gnn_delta is not None:
208
+ accuracy_delta["gnn"] = gnn_delta
209
+ models_updated.append("gnn")
210
+ else:
211
+ accuracy_delta["gnn"] = None
212
+
213
+ # 4. CNN update (only if Tier 4 samples exist)
214
+ if tier4_pairs:
215
+ try:
216
+ cnn_delta = await self._cnn.incremental_update(tier4_pairs)
217
+ if cnn_delta is not None:
218
+ accuracy_delta["cnn"] = cnn_delta
219
+ models_updated.append("cnn")
220
+ else:
221
+ accuracy_delta["cnn"] = None
222
+ except Exception as e:
223
+ logger.error(f"CNN update error: {e}")
224
+ accuracy_delta["cnn"] = None
225
+ else:
226
+ accuracy_delta["cnn"] = None
227
+
228
+ # 5. Update version
229
+ if models_updated:
230
+ self._save_version(accuracy_delta)
231
+
232
+ # 6. Hot-reload
233
+ await self._hot_reload(models_updated)
234
+
235
+ duration = time.time() - start_time
236
+
237
+ return RetrainResult(
238
+ status="success" if models_updated else "skipped",
239
+ models_updated=models_updated,
240
+ samples_used=len(url_label_pairs),
241
+ duration_seconds=round(duration, 2),
242
+ accuracy_delta=accuracy_delta,
243
+ next_retrain_hint={
244
+ "recommended_trigger": "count",
245
+ "min_samples_needed": 10,
246
+ },
247
+ )
248
+
249
+ except Exception as e:
250
+ logger.error(f"Retraining failed: {e}")
251
+ return RetrainResult(
252
+ status="error",
253
+ duration_seconds=round(time.time() - start_time, 2),
254
+ accuracy_delta=accuracy_delta,
255
+ )
256
+
257
+ def _validate_samples(self, samples: List[FeedbackRecord]) -> List[FeedbackRecord]:
258
+ """Validate and filter feedback samples."""
259
+ valid = []
260
+ for s in samples:
261
+ # Must have user feedback
262
+ if not s.user_feedback:
263
+ continue
264
+ if s.user_feedback not in ("correct", "incorrect"):
265
+ continue
266
+ # Must have a valid URL
267
+ if not s.url or not s.url.startswith(("http://", "https://")):
268
+ continue
269
+ valid.append(s)
270
+ return valid
271
+
272
+ async def _hot_reload(self, models: List[str]) -> None:
273
+ """Hot-reload updated models in-memory."""
274
+ if "bert" in models:
275
+ try:
276
+ bert_weights = Path(__file__).parent / "bert_weights"
277
+ if bert_weights.exists():
278
+ self._bert.load_local(bert_weights)
279
+ logger.info("BERT hot-reloaded")
280
+ except Exception as e:
281
+ logger.error(f"BERT hot-reload failed: {e}")
282
+
283
+ if "gnn" in models:
284
+ try:
285
+ self._gnn.reload()
286
+ logger.info("GNN hot-reloaded")
287
+ except Exception as e:
288
+ logger.error(f"GNN hot-reload failed: {e}")
289
+
290
+ if "cnn" in models:
291
+ try:
292
+ self._cnn.reload()
293
+ logger.info("CNN hot-reloaded")
294
+ except Exception as e:
295
+ logger.error(f"CNN hot-reload failed: {e}")
screenshot_collector.py ADDED
@@ -0,0 +1,172 @@
1
+ # ============================================================
2
+ # PhishGuard AI - screenshot_collector.py
3
+ # Batch screenshot capture for CNN training data generation.
4
+ #
5
+ # Uses Playwright async API with 10 concurrent captures.
6
+ # Blocks fonts, media, video for 60-70% speedup.
7
+ # Saves PNG named by URL SHA256 hash.
8
+ # ============================================================
9
+
10
+ from __future__ import annotations
11
+
12
+ import asyncio
13
+ import hashlib
14
+ import logging
15
+ import sys
16
+ from pathlib import Path
17
+ from typing import List
18
+
19
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ format="%(asctime)s | %(levelname)-7s | %(message)s",
22
+ )
23
+ logger = logging.getLogger("phishguard.screenshot_collector")
24
+
25
+ BACKEND_DIR = Path(__file__).parent
26
+ DATA_DIR = BACKEND_DIR / "data"
27
+ SCREENSHOTS_DIR = DATA_DIR / "screenshots"
28
+
29
+
30
+ def url_to_filename(url: str) -> str:
31
+ """Convert URL to safe filename using SHA256 hash."""
32
+ url_hash = hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]
33
+ return f"{url_hash}.png"
34
+
35
+
36
+ async def capture_single(
37
+ url: str,
38
+ save_dir: Path,
39
+ semaphore: asyncio.Semaphore,
40
+ browser,
41
+ ) -> bool:
42
+ """Capture a single screenshot with concurrency limiting."""
43
+ async with semaphore:
44
+ filename = url_to_filename(url)
45
+ filepath = save_dir / filename
46
+
47
+ # Skip if already captured
48
+ if filepath.exists():
49
+ return True
50
+
51
+ page = None  # defined up front so the error path can close it safely
+ try:
52
+ page = await browser.new_page(
53
+ viewport={"width": 1280, "height": 800},
54
+ user_agent=(
55
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
56
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
57
+ "Chrome/120.0.0.0 Safari/537.36"
58
+ ),
59
+ )
60
+
61
+ # Block heavy resources for speed
62
+ await page.route(
63
+ "**/*.{woff,woff2,ttf,eot,mp4,webm,ogg,avi,mp3,wav,flac}",
64
+ lambda route: route.abort(),
65
+ )
66
+
67
+ await page.goto(
68
+ url,
69
+ wait_until="domcontentloaded",
70
+ timeout=10000,
71
+ )
72
+
73
+ # Brief wait for rendering
74
+ await asyncio.sleep(0.5)
75
+
76
+ screenshot = await page.screenshot(type="png")
77
+ filepath.write_bytes(screenshot)
78
+
79
+ await page.close()
80
+ return True
81
+
82
+ except Exception as e:
83
+ logger.debug(f"Screenshot failed for {url}: {e}")
84
+ try:
85
+ if page: await page.close()
86
+ except Exception:
87
+ pass
88
+ return False
89
+
90
+
91
+ async def batch_capture(
92
+ urls: List[str],
93
+ save_dir: Path,
94
+ concurrency: int = 10,
95
+ label: str = "urls",
96
+ ) -> int:
97
+ """
98
+ Capture screenshots for a batch of URLs concurrently.
99
+ Returns count of successful captures.
100
+ """
101
+ save_dir.mkdir(parents=True, exist_ok=True)
102
+
103
+ try:
104
+ from playwright.async_api import async_playwright
105
+ except ImportError:
106
+ logger.error("Playwright not installed. Run: pip install playwright && playwright install chromium")
107
+ return 0
108
+
109
+ semaphore = asyncio.Semaphore(concurrency)
110
+ success_count = 0
111
+
112
+ async with async_playwright() as p:
113
+ browser = await p.chromium.launch(headless=True)
114
+
115
+ tasks = [
116
+ capture_single(url, save_dir, semaphore, browser)
117
+ for url in urls
118
+ ]
119
+
120
+ results = await asyncio.gather(*tasks, return_exceptions=True)
121
+
122
+ for i, result in enumerate(results):
123
+ if result is True:
124
+ success_count += 1
125
+ if (i + 1) % 50 == 0:
126
+ logger.info(f" {label}: {i+1}/{len(urls)} processed ({success_count} captured)")
127
+
128
+ await browser.close()
129
+
130
+ logger.info(f" {label}: {success_count}/{len(urls)} screenshots captured")
131
+ return success_count
132
+
133
+
134
+ async def collect_training_screenshots(
135
+ phish_count: int = 10,
136
+ legit_count: int = 10,
137
+ ) -> None:
138
+ """Collect screenshots for CNN training."""
139
+ from data_collector import download_phishtank, download_tranco
140
+
141
+ phishing_dir = SCREENSHOTS_DIR / "phishing"
142
+ legitimate_dir = SCREENSHOTS_DIR / "legitimate"
143
+
144
+ # Download URL lists
145
+ print("πŸ“₯ Loading URL lists...")
146
+ phish_urls = download_phishtank(max_urls=phish_count)[:phish_count]
147
+ legit_urls = download_tranco(n=legit_count)[:legit_count]
148
+
149
+ print(f"\nπŸ“Έ Capturing phishing screenshots ({len(phish_urls)} URLs)...")
150
+ phish_success = await batch_capture(phish_urls, phishing_dir, label="Phishing")
151
+
152
+ print(f"\nπŸ“Έ Capturing legitimate screenshots ({len(legit_urls)} URLs)...")
153
+ legit_success = await batch_capture(legit_urls, legitimate_dir, label="Legitimate")
154
+
155
+ print(f"\nβœ… Screenshots collected:")
156
+ print(f" Phishing: {phish_success}/{len(phish_urls)}")
157
+ print(f" Legitimate: {legit_success}/{len(legit_urls)}")
158
+ print(f" Saved to: {SCREENSHOTS_DIR}")
159
+
160
+
161
+ def main() -> None:
162
+ print("=" * 60)
163
+ print("PhishGuard AI οΏ½οΏ½οΏ½ Screenshot Collection")
164
+ print("=" * 60)
165
+
166
+ asyncio.run(collect_training_screenshots())
167
+
168
+ print("=" * 60)
169
+
170
+
171
+ if __name__ == "__main__":
172
+ main()
screenshot_hasher.py ADDED
@@ -0,0 +1,214 @@
1
+ # ============================================================
2
+ # PhishGuard AI - cnn/screenshot_hasher.py
3
+ # Perceptual hash-based brand impersonation detector.
4
+ #
5
+ # Compares webpage screenshots against reference hashes of
6
+ # known brand login pages using imagehash.phash.
7
+ #
8
+ # brand_boost = 0.25 if hamming_distance < 10 else 0.0
9
+ # ============================================================
10
+
11
+ from __future__ import annotations
12
+
13
+ import io
14
+ import json
15
+ import logging
16
+ from pathlib import Path
17
+ from typing import Tuple, Optional, Dict, List
18
+
19
+ from PIL import Image
20
+
21
+ logger = logging.getLogger("phishguard.cnn.hasher")
22
+
23
+ # ── Try to use imagehash, fall back to custom implementation ─────────
24
+ _imagehash_available = False
25
+ try:
26
+ import imagehash
27
+ _imagehash_available = True
28
+ except ImportError:
29
+ logger.info("imagehash not installed β€” using built-in phash")
30
+
31
+ HASH_DB_PATH = Path(__file__).parent / "brand_hashes.json"
32
+
33
+
34
+ class BrandHashDetector:
35
+ """
36
+ Perceptual hash-based brand impersonation detector.
37
+ Compares screenshots against reference hashes of 10 major brands.
38
+ """
39
+
40
+ BRANDS: List[str] = [
41
+ "paypal", "google", "apple", "microsoft", "amazon",
42
+ "chase", "netflix", "facebook", "instagram", "wellsfargo",
43
+ ]
44
+
45
+ BRAND_DOMAINS: Dict[str, str] = {
46
+ "paypal": "paypal.com",
47
+ "google": "google.com",
48
+ "apple": "apple.com",
49
+ "microsoft": "microsoft.com",
50
+ "amazon": "amazon.com",
51
+ "chase": "chase.com",
52
+ "netflix": "netflix.com",
53
+ "facebook": "facebook.com",
54
+ "instagram": "instagram.com",
55
+ "wellsfargo": "wellsfargo.com",
56
+ }
57
+
58
+ def __init__(self, hash_db_path: Optional[Path] = None) -> None:
59
+ self._hash_db_path = hash_db_path or HASH_DB_PATH
60
+ self._reference_hashes: Dict[str, dict] = {}
61
+ self._load_reference_hashes()
62
+
63
+ def _load_reference_hashes(self) -> None:
64
+ """Load reference hashes from JSON database."""
65
+ if self._hash_db_path.exists():
66
+ try:
67
+ with open(self._hash_db_path) as f:
68
+ self._reference_hashes = json.load(f)
69
+ logger.info(f"Loaded {len(self._reference_hashes)} brand hashes")
70
+ except Exception as e:
71
+ logger.warning(f"Failed to load brand hashes: {e}")
72
+ self._reference_hashes = {}
73
+ else:
74
+ logger.info("No brand hash DB found β€” brand detection disabled")
75
+ self._reference_hashes = {}
76
+
77
+ def compute_hash(self, img_bytes: bytes, hash_size: int = 16) -> Optional[int]:
78
+ """
79
+ Compute perceptual hash of an image.
80
+ Uses imagehash.phash if available, otherwise custom DCT-less implementation.
81
+ """
82
+ try:
83
+ img = Image.open(io.BytesIO(img_bytes))
84
+
85
+ if _imagehash_available:
86
+ h = imagehash.phash(img, hash_size=hash_size)
87
+ return int(str(h), 16)
88
+ else:
89
+ return self._custom_phash(img, hash_size)
90
+
91
+ except Exception as e:
92
+ logger.warning(f"Hash computation failed: {e}")
93
+ return None
94
+
95
+ def _custom_phash(self, img: Image.Image, hash_size: int = 16) -> int:
96
+ """Fallback perceptual hash (mean-based, no DCT)."""
97
+ img = img.convert("L").resize((hash_size, hash_size), Image.LANCZOS)
98
+ pixels = list(img.getdata())
99
+ avg = sum(pixels) / len(pixels)
100
+ bits = "".join("1" if p > avg else "0" for p in pixels)
101
+ return int(bits, 2)
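+ # for the default 16x16 hash this packs 256 bits into one int, where bit i
+ # is set when pixel i is brighter than the image mean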
102
+
103
+ def hamming_distance(self, h1: int, h2: int) -> int:
104
+ """Count bit differences between two hashes. 0 = identical."""
105
+ return bin(h1 ^ h2).count("1")
106
+
107
+ def detect(
108
+ self,
109
+ screenshot_bytes: bytes,
110
+ url: str = "",
111
+ threshold: int = 10,
112
+ ) -> Tuple[bool, str, float]:
113
+ """
114
+ Detect brand impersonation from screenshot.
115
+
116
+ Returns:
117
+ (is_impersonation, brand_name, confidence)
118
+ is_impersonation: True if page looks like a brand but URL doesn't match
119
+ brand_name: detected brand name or ""
120
+ confidence: 0.0-1.0 similarity score
121
+ """
122
+ page_hash = self.compute_hash(screenshot_bytes)
123
+ if page_hash is None:
124
+ return False, "", 0.0
125
+
126
+ url_lower = url.lower()
127
+ best_match: Optional[str] = None
128
+ best_distance = 999
129
+ best_confidence = 0.0
130
+
131
+ for brand, entry in self._reference_hashes.items():
132
+ try:
133
+ stored_hash = int(entry["hash"])
134
+ distance = self.hamming_distance(page_hash, stored_hash)
135
+ confidence = max(0.0, 1.0 - distance / 256.0)
136
+
137
+ if distance < best_distance:
138
+ best_distance = distance
139
+ best_match = brand
140
+ best_confidence = confidence
141
+
142
+ except (ValueError, KeyError):
143
+ continue
144
+
145
+ if best_match and best_distance <= threshold:
146
+ legit_domain = self.BRAND_DOMAINS.get(best_match, f"{best_match}.com")
147
+
148
+ # Check if URL belongs to legitimate domain
149
+ if legit_domain not in url_lower:
150
+ return True, best_match, best_confidence
151
+ else:
152
+ return False, best_match, best_confidence
153
+
154
+ return False, "", 0.0
155
+
156
+ def register_brand(
157
+ self,
158
+ brand_name: str,
159
+ domain: str,
160
+ screenshot_bytes: bytes,
161
+ ) -> bool:
162
+ """Register a brand's reference screenshot hash."""
163
+ h = self.compute_hash(screenshot_bytes)
164
+ if h is None:
165
+ return False
166
+
167
+ self._reference_hashes[brand_name] = {
168
+ "domain": domain,
169
+ "hash": str(h),
170
+ }
171
+
172
+ # Save to disk
173
+ try:
174
+ with open(self._hash_db_path, "w") as f:
175
+ json.dump(self._reference_hashes, f, indent=2)
176
+ logger.info(f"Registered brand: {brand_name} ({domain})")
177
+ return True
178
+ except Exception as e:
179
+ logger.error(f"Failed to save brand hash: {e}")
180
+ return False
181
+
182
+
183
+ # ── Legacy compatibility ─────────────────────────────────────────────
184
+ _detector = BrandHashDetector()
185
+
186
+
187
+ def check_brand_impersonation(
188
+ screenshot_bytes: bytes,
189
+ url: str,
190
+ similarity_threshold: int = 10,
191
+ ) -> dict:
192
+ """Legacy wrapper for backward compatibility."""
193
+ is_impersonation, brand, confidence = _detector.detect(
194
+ screenshot_bytes, url, similarity_threshold,
195
+ )
196
+
197
+ if is_impersonation:
198
+ return {
199
+ "impersonation_detected": True,
200
+ "impersonated_brand": brand,
201
+ "legitimate_domain": _detector.BRAND_DOMAINS.get(brand, ""),
202
+ "visual_similarity": round(confidence, 3),
203
+ }
204
+ elif brand:
205
+ return {
206
+ "impersonation_detected": False,
207
+ "matched_brand": brand,
208
+ "note": "legitimate site",
209
+ }
210
+ else:
211
+ return {
212
+ "impersonation_detected": False,
213
+ "reason": "no_brand_match",
214
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
test_endpoint.py ADDED
@@ -0,0 +1,34 @@
+ import asyncio
+ import json
+
+ from main import analyze_email_endpoint, EmailRequest
+
+
+ async def run_tests():
+     print("--- Testing Tier 1: Whitelist ---")
+     res1 = await analyze_email_endpoint(EmailRequest(
+         sender="noreply@github.com",
+         subject="Your receipt",
+         body="...",
+         urls=[]
+     ))
+     print(json.dumps(res1, indent=2))
+
+     print("\n--- Testing Tier 2: Text Heuristic (No URLs) ---")
+     res2 = await analyze_email_endpoint(EmailRequest(
+         sender="admin@unknown-domain.com",
+         subject="URGENT: Password Reset Required",
+         body="Please reset immediately.",
+         urls=[]
+     ))
+     print(json.dumps(res2, indent=2))
+
+     print("\n--- Testing Tier 2: Async URLs ---")
+     res3 = await analyze_email_endpoint(EmailRequest(
+         sender="service@paypal-update.net",
+         subject="Important Account Update",
+         body="Click the link to verify your account.",
+         urls=["http://chase-bank-verify-login.cx/auth", "http://google.com/"]
+     ))
+     print(json.dumps(res3, indent=2))
+
+
+ if __name__ == "__main__":
+     asyncio.run(run_tests())
tier3_bert_gnn.py ADDED
@@ -0,0 +1,153 @@
+ # ============================================================
+ # PhishGuard AI - tier3_bert_gnn.py
+ # Tier 3: BERT + GNN Parallel Ensemble
+ #
+ # Triggered only when Tier 2 score < 80.
+ # BERT and GNN run in PARALLEL via asyncio.gather + run_in_executor.
+ #
+ # Ensemble formula:
+ #   P3 = 0.45·P_bert + 0.35·P_gnn + 0.20·(H_score/100)
+ #
+ # Decision:
+ #   P3 >= 0.85        → BLOCK
+ #   P3 < 0.40         → SAFE
+ #   0.40 <= P3 < 0.85 → escalate to Tier 4
+ # ============================================================
+
+ from __future__ import annotations
+
+ import asyncio
+ import logging
+
+ logger = logging.getLogger("phishguard.tier3")
+
+
+ class Tier3Ensemble:
+     """
+     Tier 3: BERT + GNN parallel ensemble classifier.
+
+     Runs BERT and GNN inference in parallel using asyncio.gather
+     with run_in_executor for non-blocking thread pool execution.
+     """
+
+     # Ensemble weights
+     W_BERT: float = 0.45
+     W_GNN: float = 0.35
+     W_HEURISTIC: float = 0.20
+
+     def __init__(
+         self,
+         bert_classifier,
+         gnn_inference,
+     ) -> None:
+         self._bert = bert_classifier
+         self._gnn = gnn_inference
+
+     async def predict(
+         self,
+         url: str,
+         title: str = "",
+         snippet: str = "",
+         h_score: int = 0,
+     ) -> float:
+         """
+         Run BERT + GNN in parallel and compute ensemble score.
+
+         Args:
+             url: The URL to analyze
+             title: Page title (optional)
+             snippet: Page content snippet (optional)
+             h_score: Heuristic score from Tier 2 (0-100, passed through, NOT recomputed)
+
+         Returns:
+             P3 ∈ [0,1] — ensemble phishing probability
+         """
+         # get_running_loop() is the non-deprecated call inside a coroutine
+         loop = asyncio.get_running_loop()
+
+         # Run BERT and GNN in parallel (both are CPU-bound, use thread pool)
+         bert_task = self._bert_predict(url, title, snippet, loop)
+         gnn_task = self._gnn_predict(url, loop)
+
+         p_bert, p_gnn = await asyncio.gather(bert_task, gnn_task)
+
+         # Ensemble: P3 = 0.45·P_bert + 0.35·P_gnn + 0.20·H_norm
+         h_norm = h_score / 100.0
+         p3 = (self.W_BERT * p_bert) + (self.W_GNN * p_gnn) + (self.W_HEURISTIC * h_norm)
+
+         logger.info(
+             f"Tier3 ensemble | url={url[:60]} | "
+             f"P_bert={p_bert:.4f} P_gnn={p_gnn:.4f} H_norm={h_norm:.4f} → P3={p3:.4f}"
+         )
+
+         return round(min(max(p3, 0.0), 1.0), 4)
+
+     async def _bert_predict(
+         self,
+         url: str,
+         title: str,
+         snippet: str,
+         loop: asyncio.AbstractEventLoop,
+     ) -> float:
+         """
+         Run BERT inference in thread pool (non-blocking).
+         Returns P_bert ∈ [0,1].
+         """
+         try:
+             p_bert = await asyncio.wait_for(
+                 loop.run_in_executor(
+                     None,  # Default thread pool
+                     self._bert.predict,
+                     url,
+                     title,
+                     snippet,
+                 ),
+                 timeout=10.0,
+             )
+             return float(p_bert)
+         except asyncio.TimeoutError:
+             logger.warning(f"BERT timeout for {url[:50]}")
+             return 0.5  # Neutral on timeout
+         except Exception as e:
+             logger.error(f"BERT predict error: {e}")
+             return 0.5
+
+     async def _gnn_predict(
+         self,
+         url: str,
+         loop: asyncio.AbstractEventLoop,
+     ) -> float:
+         """
+         Run GNN inference in thread pool (non-blocking).
+         Returns P_gnn ∈ [0,1].
+         """
+         try:
+             p_gnn = await asyncio.wait_for(
+                 loop.run_in_executor(
+                     None,
+                     self._gnn.predict,
+                     url,
+                     None,  # related_urls
+                 ),
+                 timeout=5.0,
+             )
+             return float(p_gnn)
+         except asyncio.TimeoutError:
+             logger.warning(f"GNN timeout for {url[:50]}")
+             return 0.5
+         except Exception as e:
+             logger.error(f"GNN predict error: {e}")
+             return 0.5
+
+     @staticmethod
+     def decide(p3: float) -> str:
+         """
+         Make decision based on P3 score.
+         Returns: 'block', 'safe', or 'escalate'
+         """
+         if p3 >= 0.85:
+             return "block"
+         elif p3 < 0.40:
+             return "safe"
+         else:
+             return "escalate"
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
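
These are standard Hugging Face tokenizer files, so they can be loaded directly with `transformers`; a minimal sketch, assuming the files sit in the local `bert_weights/` directory the Dockerfile creates:

from transformers import AutoTokenizer

# Picks up tokenizer_config.json, special_tokens_map.json, tokenizer.json, vocab.txt
tokenizer = AutoTokenizer.from_pretrained("bert_weights")
enc = tokenizer("http://paypal-secure-login.xyz/verify", truncation=True, max_length=512)
print(enc["input_ids"][:3])  # begins with 101, the [CLS] id defined above
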
train_cnn.py ADDED
@@ -0,0 +1,277 @@
+ # ============================================================
+ # PhishGuard AI - cnn/train_cnn.py
+ # CNN fine-tuning script for phishing screenshot detection.
+ #
+ # Loads data/screenshots/ with ImageFolder structure
+ # Augmentation: RandomHorizontalFlip, ColorJitter, RandomRotation
+ # AdamW on the classifier head only (backbone stays frozen);
+ # EPOCHS below defaults to 2 for a quick CPU run — raise it
+ # (e.g. to 15) for a real training pass.
+ # Saves cnn_weights.pt + cnn_replay_buffer.pt
+ # Works with as few as 100 images per class
+ # ============================================================
+
+ from __future__ import annotations
+
+ import logging
+ import sys
+ from pathlib import Path
+ from typing import List
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s | %(levelname)-7s | %(message)s",
+ )
+ logger = logging.getLogger("phishguard.cnn.train")
+
+ CNN_DIR = Path(__file__).parent
+ BACKEND_DIR = CNN_DIR.parent
+ WEIGHTS_PATH = CNN_DIR / "cnn_weights.pt"
+ REPLAY_BUFFER_PATH = BACKEND_DIR / "data" / "cnn_replay_buffer.pt"
+ SCREENSHOTS_DIR = BACKEND_DIR / "data" / "screenshots"
+
+ sys.path.insert(0, str(CNN_DIR))
+ sys.path.insert(0, str(BACKEND_DIR))
+
+
+ def main() -> None:
+     print("=" * 60)
+     print("PhishGuard AI — CNN Training")
+     print("=" * 60)
+
+     import torch
+     import torch.nn as nn
+     from torch.optim import AdamW
+     from torch.utils.data import DataLoader, Dataset
+     import torchvision.transforms as T
+     from PIL import Image
+     from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+
+     from cnn_model import PhishCNN
+
+     # ── Check data ───────────────────────────────────────────────
+     phishing_dir = SCREENSHOTS_DIR / "phishing"
+     legitimate_dir = SCREENSHOTS_DIR / "legitimate"
+
+     if not phishing_dir.exists() or not legitimate_dir.exists():
+         print("\n⚠️ Screenshot directories not found:")
+         print(f"   Expected: {phishing_dir}")
+         print(f"   Expected: {legitimate_dir}")
+         print("\n   Run: python screenshot_collector.py")
+
+         # Create dirs and generate placeholder images for testing
+         phishing_dir.mkdir(parents=True, exist_ok=True)
+         legitimate_dir.mkdir(parents=True, exist_ok=True)
+
+         print("   Generating synthetic training images...")
+         _generate_synthetic_screenshots(phishing_dir, legitimate_dir)
+
+     phishing_files = list(phishing_dir.glob("*.png")) + list(phishing_dir.glob("*.jpg"))
+     legit_files = list(legitimate_dir.glob("*.png")) + list(legitimate_dir.glob("*.jpg"))
+
+     print("\n📊 Dataset:")
+     print(f"   Phishing screenshots:   {len(phishing_files)}")
+     print(f"   Legitimate screenshots: {len(legit_files)}")
+
+     if len(phishing_files) < 10 or len(legit_files) < 10:
+         print("⚠️ Too few screenshots. Generating synthetic images...")
+         _generate_synthetic_screenshots(phishing_dir, legitimate_dir, count=100)
+         phishing_files = list(phishing_dir.glob("*.png"))
+         legit_files = list(legitimate_dir.glob("*.png"))
+         print(f"   Phishing: {len(phishing_files)}, Legitimate: {len(legit_files)}")
+
+     # ── Dataset ──────────────────────────────────────────────────
+     train_transform = T.Compose([
+         T.Resize((224, 224)),
+         T.RandomHorizontalFlip(),
+         T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
+         T.RandomRotation(5),
+         T.ToTensor(),
+         T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+     ])
+
+     val_transform = T.Compose([
+         T.Resize((224, 224)),
+         T.ToTensor(),
+         T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+     ])
+
+     class ScreenshotDataset(Dataset):
+         def __init__(self, files: List[Path], label: int, transform):
+             self.files = files
+             self.label = label
+             self.transform = transform
+
+         def __len__(self) -> int:
+             return len(self.files)
+
+         def __getitem__(self, idx: int):
+             try:
+                 img = Image.open(self.files[idx]).convert("RGB")
+                 tensor = self.transform(img)
+                 return tensor, self.label
+             except Exception:
+                 # Return black image on error
+                 tensor = torch.zeros(3, 224, 224)
+                 return tensor, self.label
+
+     # Split: 80% train, 20% val
+     import random
+     random.shuffle(phishing_files)
+     random.shuffle(legit_files)
+
+     phish_split = int(len(phishing_files) * 0.8)
+     legit_split = int(len(legit_files) * 0.8)
+
+     train_phish = phishing_files[:phish_split]
+     val_phish = phishing_files[phish_split:]
+     train_legit = legit_files[:legit_split]
+     val_legit = legit_files[legit_split:]
+
+     # Dataset.__add__ returns a ConcatDataset, so "+" merges the two classes
+     train_dataset = (
+         ScreenshotDataset(train_phish, 1, train_transform)
+         + ScreenshotDataset(train_legit, 0, train_transform)
+     )
+     val_dataset = (
+         ScreenshotDataset(val_phish, 1, val_transform)
+         + ScreenshotDataset(val_legit, 0, val_transform)
+     )
+
+     train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0)
+     val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0)
+
+     # ── Model ────────────────────────────────────────────────────
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"\n🤖 Device: {device}")
+
+     model = PhishCNN(pretrained=True).to(device)
+     trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+     total = sum(p.numel() for p in model.parameters())
+     print(f"   Parameters: {total:,} total, {trainable:,} trainable")
+
+     # Only optimize head parameters
+     head_params = [p for p in model.backbone.fc.parameters() if p.requires_grad]
+     optimizer = AdamW(head_params, lr=1e-3, weight_decay=1e-4)
+     loss_fn = nn.BCELoss()
+
+     # ── Training ─────────────────────────────────────────────────
+     EPOCHS = 2
+     best_val_acc = 0.0
+
+     print(f"\n🏋️ Training for {EPOCHS} epochs...")
+     print(f"   {'Epoch':>5} | {'Loss':>8} | {'Train Acc':>9} | {'Val Acc':>7}")
+     print(f"   {'─'*5} | {'─'*8} | {'─'*9} | {'─'*7}")
+
+     for epoch in range(1, EPOCHS + 1):
+         # Train
+         model.train()
+         total_loss = 0.0
+         train_preds, train_labels = [], []
+
+         for batch_x, batch_y in train_loader:
+             batch_x = batch_x.to(device)
+             batch_y = batch_y.float().to(device)
+
+             optimizer.zero_grad()
+             # squeeze(-1) keeps a 1-D batch dim even when a batch has one sample
+             output = model(batch_x).squeeze(-1)
+             loss = loss_fn(output, batch_y)
+             loss.backward()
+             optimizer.step()
+
+             total_loss += loss.item()
+             preds = (output >= 0.5).int()
+             train_preds.extend(preds.cpu().tolist())
+             train_labels.extend(batch_y.int().cpu().tolist())
+
+         avg_loss = total_loss / max(len(train_loader), 1)
+         train_acc = accuracy_score(train_labels, train_preds) if train_labels else 0.0
+
+         # Validate
+         model.eval()
+         val_preds, val_labels = [], []
+         with torch.no_grad():
+             for batch_x, batch_y in val_loader:
+                 batch_x = batch_x.to(device)
+                 batch_y = batch_y.float().to(device)
+                 output = model(batch_x).squeeze(-1)
+                 preds = (output >= 0.5).int()
+                 val_preds.extend(preds.cpu().tolist())
+                 val_labels.extend(batch_y.int().cpu().tolist())
+
+         val_acc = accuracy_score(val_labels, val_preds) if val_labels else 0.0
+
+         if epoch % 3 == 0 or epoch == 1 or epoch == EPOCHS:
+             print(f"   {epoch:>5} | {avg_loss:>8.4f} | {train_acc:>9.4f} | {val_acc:>7.4f}")
+
+         # ">=" ensures weights are written at least once, even if val_acc stays flat
+         if val_acc >= best_val_acc:
+             best_val_acc = val_acc
+             torch.save(model.state_dict(), WEIGHTS_PATH)
+
+     # ── Final metrics ────────────────────────────────────────────
+     if val_labels:
+         precision, recall, f1, _ = precision_recall_fscore_support(
+             val_labels, val_preds, average="binary", zero_division=0,
+         )
+         print("\n📊 Final Validation:")
+         print(f"   Accuracy:  {best_val_acc:.4f}")
+         print(f"   Precision: {precision:.4f}")
+         print(f"   Recall:    {recall:.4f}")
+         print(f"   F1 Score:  {f1:.4f}")
+
+     # ── Save replay buffer ───────────────────────────────────────
+     # Keep up to 50 paths per class so paths and labels stay aligned
+     replay_phish = phishing_files[:50]
+     replay_legit = legit_files[:50]
+     replay_paths = [str(p) for p in replay_phish + replay_legit]
+     replay_labels = [1] * len(replay_phish) + [0] * len(replay_legit)
+     REPLAY_BUFFER_PATH.parent.mkdir(parents=True, exist_ok=True)
+     torch.save({"paths": replay_paths, "labels": replay_labels}, REPLAY_BUFFER_PATH)
+
+     print(f"\n✅ CNN weights saved to: {WEIGHTS_PATH}")
+     print(f"💾 Replay buffer saved: {len(replay_paths)} paths → {REPLAY_BUFFER_PATH}")
+     print("=" * 60)
+
+
+ def _generate_synthetic_screenshots(
+     phishing_dir: Path,
+     legitimate_dir: Path,
+     count: int = 100,
+ ) -> None:
+     """Generate synthetic screenshots for training when real data unavailable."""
+     import random
+     from PIL import Image, ImageDraw
+
+     for label, save_dir, colors in [
+         ("phishing", phishing_dir, [(200, 50, 50), (180, 30, 30), (220, 80, 60)]),
+         ("legitimate", legitimate_dir, [(50, 120, 200), (30, 100, 180), (60, 140, 220)]),
+     ]:
+         save_dir.mkdir(parents=True, exist_ok=True)
+         existing = len(list(save_dir.glob("*.png")))
+         needed = max(0, count - existing)
+
+         for i in range(needed):
+             # Create varied synthetic images
+             w, h = 1280, 800
+             bg = random.choice(colors)
+             img = Image.new("RGB", (w, h), bg)
+             draw = ImageDraw.Draw(img)
+
+             # Add shapes
+             for _ in range(random.randint(5, 15)):
+                 x1 = random.randint(0, w - 100)
+                 y1 = random.randint(0, h - 100)
+                 x2 = x1 + random.randint(50, 300)
+                 y2 = y1 + random.randint(30, 200)
+                 color = tuple(random.randint(0, 255) for _ in range(3))
+                 draw.rectangle([x1, y1, x2, y2], fill=color)
+
+             # Add text-like rectangles
+             for _ in range(random.randint(3, 8)):
+                 x = random.randint(100, w - 400)
+                 y = random.randint(100, h - 100)
+                 draw.rectangle([x, y, x + random.randint(100, 300), y + 20],
+                                fill=(255, 255, 255))
+
+             img.save(save_dir / f"synthetic_{i:04d}.png")
+
+     logger.info(f"Generated synthetic screenshots in {phishing_dir.parent}")
+
+
+ if __name__ == "__main__":
+     main()
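
A hedged inference sketch for the weights this script saves; `PhishCNN`'s constructor signature is assumed from its use above, and the image path is illustrative:

import torch
import torchvision.transforms as T
from PIL import Image

from cnn_model import PhishCNN

model = PhishCNN(pretrained=False)
model.load_state_dict(torch.load("cnn/cnn_weights.pt", map_location="cpu"))
model.eval()

# Same preprocessing as val_transform above
preprocess = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

img = preprocess(Image.open("suspect.png").convert("RGB")).unsqueeze(0)
with torch.no_grad():
    p_phish = model(img).squeeze(-1).item()  # sigmoid output in [0, 1]
print(f"P(phishing) = {p_phish:.3f}")
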
train_gnn.py ADDED
@@ -0,0 +1,228 @@
+ # ============================================================
+ # PhishGuard AI - gnn/train_gnn.py
+ # Full GNN training script.
+ #
+ # Downloads PhishTank bz2 + TRANCO zip + Kaggle CSV mirror
+ # Builds training graphs; EPOCHS below defaults to 2 for a
+ # quick run (raise to ~40 for a full pass); saves gnn_weights.pt
+ # 70/15/15 train/val/test split with stratification
+ # Saves replay buffer to gnn_replay_buffer.pt
+ # ============================================================
+
+ from __future__ import annotations
+
+ import sys
+ import random
+ import logging
+ from pathlib import Path
+ from typing import List, Tuple
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s | %(levelname)-7s | %(message)s",
+ )
+ logger = logging.getLogger("phishguard.gnn.train")
+
+ # Paths
+ GNN_DIR = Path(__file__).parent
+ BACKEND_DIR = GNN_DIR.parent
+ WEIGHTS_PATH = GNN_DIR / "gnn_weights.pt"
+ REPLAY_BUFFER_PATH = BACKEND_DIR / "data" / "gnn_replay_buffer.pt"
+
+ # Add backend to path for imports
+ sys.path.insert(0, str(BACKEND_DIR))
+ sys.path.insert(0, str(GNN_DIR))
+
+
+ def main() -> None:
+     print("=" * 60)
+     print("PhishGuard AI — GNN Training")
+     print("=" * 60)
+
+     import torch
+     import torch.nn.functional as F
+     from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+
+     from domain_graph_builder import DomainGraphBuilder
+     from gnn_model import PhishGNN, PhishMLP, PYGEOM_AVAILABLE
+
+     # ── Download data ────────────────────────────────────────────
+     from data_collector import download_phishtank, download_tranco, merge_datasets
+
+     print("\n📥 Downloading datasets...")
+     phish_urls = download_phishtank(max_urls=50)
+     legit_urls = download_tranco(n=50)
+     print(f"   Phishing URLs:   {len(phish_urls)}")
+     print(f"   Legitimate URLs: {len(legit_urls)}")
+
+     train_data, val_data, test_data = merge_datasets(phish_urls, legit_urls)
+
+     # ── Build graphs ─────────────────────────────────────────────
+     builder = DomainGraphBuilder()
+     CHUNK_SIZE = 4  # Group URLs into small graphs
+
+     def build_dataset(data: List[Tuple[str, int]], desc: str) -> list:
+         """Build graph dataset from (url, label) pairs."""
+         dataset = []
+         # Separate by label
+         phish = [url for url, label in data if label == 1]
+         legit = [url for url, label in data if label == 0]
+
+         for urls, label in [(phish, 1), (legit, 0)]:
+             for i in range(0, len(urls), CHUNK_SIZE):
+                 chunk = urls[i : i + CHUNK_SIZE]
+                 if not chunk:
+                     continue
+                 graph = builder.build_graph(chunk)
+                 x = torch.tensor(graph["features"], dtype=torch.float)
+                 edges = graph["edges"]
+                 if edges:
+                     edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
+                 else:
+                     # Self-loops for graphs with no edges
+                     n = x.size(0)
+                     edge_index = torch.arange(n).unsqueeze(0).repeat(2, 1)
+                 dataset.append({
+                     "x": x,
+                     "edge_index": edge_index,
+                     "y": torch.tensor([float(label)]),
+                 })
+
+         random.shuffle(dataset)
+         print(f"   {desc}: {len(dataset)} graphs")
+         return dataset
+
+     print("\n🔨 Building graphs...")
+     train_graphs = build_dataset(train_data, "Train")
+     val_graphs = build_dataset(val_data, "Val")
+     test_graphs = build_dataset(test_data, "Test")
+
+     # ── Create model ─────────────────────────────────────────────
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"\n🤖 Device: {device}")
+
+     model = PhishGNN() if PYGEOM_AVAILABLE else PhishMLP()
+     model = model.to(device)
+     model_type = "GCN" if PYGEOM_AVAILABLE else "MLP"
+     print(f"   Model: Phish{model_type}")
+     print(f"   Parameters: {sum(p.numel() for p in model.parameters()):,}")
+
+     # ── Training ─────────────────────────────────────────────────
+     EPOCHS = 2
+     LR = 0.001
+     WEIGHT_DECAY = 1e-4
+
+     optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
+     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+         optimizer, mode="min", factor=0.5, patience=5, min_lr=1e-6,
+     )
+     loss_fn = F.binary_cross_entropy
+
+     best_val_acc = 0.0
+     best_epoch = 0
+
+     print(f"\n🏋️ Training for {EPOCHS} epochs...")
+     print(f"   {'Epoch':>5} | {'Loss':>8} | {'Train Acc':>9} | {'Val Acc':>7} | {'LR':>10}")
+     print(f"   {'─' * 5} | {'─' * 8} | {'─' * 9} | {'─' * 7} | {'─' * 10}")
+
+     for epoch in range(1, EPOCHS + 1):
+         # ── Train ────────────────────────────────────────────────
+         model.train()
+         total_loss = 0.0
+         train_preds = []
+         train_labels = []
+
+         random.shuffle(train_graphs)
+         for item in train_graphs:
+             x = item["x"].to(device)
+             ei = item["edge_index"].to(device)
+             y = item["y"].to(device)
+
+             optimizer.zero_grad()
+             out = model(x, ei)
+             loss = loss_fn(out.squeeze(), y.squeeze())
+             loss.backward()
+             optimizer.step()
+
+             total_loss += loss.item()
+             pred = 1 if out.squeeze().item() >= 0.5 else 0
+             train_preds.append(pred)
+             train_labels.append(int(y.item()))
+
+         avg_loss = total_loss / max(len(train_graphs), 1)
+         train_acc = accuracy_score(train_labels, train_preds)
+
+         # ── Validate ─────────────────────────────────────────────
+         model.eval()
+         val_preds = []
+         val_labels = []
+
+         with torch.no_grad():
+             for item in val_graphs:
+                 x = item["x"].to(device)
+                 ei = item["edge_index"].to(device)
+                 y = item["y"].to(device)
+
+                 out = model(x, ei)
+                 pred = 1 if out.squeeze().item() >= 0.5 else 0
+                 val_preds.append(pred)
+                 val_labels.append(int(y.item()))
+
+         val_acc = accuracy_score(val_labels, val_preds) if val_labels else 0.0
+         scheduler.step(avg_loss)
+         current_lr = optimizer.param_groups[0]["lr"]
+
+         # Print progress
+         if epoch % 5 == 0 or epoch == 1 or epoch == EPOCHS:
+             print(f"   {epoch:>5} | {avg_loss:>8.4f} | {train_acc:>9.4f} | {val_acc:>7.4f} | {current_lr:>10.6f}")
+
+         # Save best model (">=" guarantees the weights file exists for the reload below)
+         if val_acc >= best_val_acc:
+             best_val_acc = val_acc
+             best_epoch = epoch
+             torch.save(model.state_dict(), WEIGHTS_PATH)
+
+     print(f"\n   Best val accuracy: {best_val_acc:.4f} at epoch {best_epoch}")
+
+     # ── Test ─────────────────────────────────────────────────────
+     # Reload best weights
+     model.load_state_dict(
+         torch.load(WEIGHTS_PATH, map_location=device, weights_only=True)
+     )
+     model.eval()
+
+     test_preds = []
+     test_labels = []
+     with torch.no_grad():
+         for item in test_graphs:
+             x = item["x"].to(device)
+             ei = item["edge_index"].to(device)
+             y = item["y"].to(device)
+
+             out = model(x, ei)
+             pred = 1 if out.squeeze().item() >= 0.5 else 0
+             test_preds.append(pred)
+             test_labels.append(int(y.item()))
+
+     test_acc = accuracy_score(test_labels, test_preds) if test_labels else 0.0
+     precision, recall, f1, _ = precision_recall_fscore_support(
+         test_labels, test_preds, average="binary", zero_division=0,
+     )
+
+     print("\n📊 Test Results:")
+     print(f"   Accuracy:  {test_acc:.4f}")
+     print(f"   Precision: {precision:.4f}")
+     print(f"   Recall:    {recall:.4f}")
+     print(f"   F1 Score:  {f1:.4f}")
+
+     # ── Save replay buffer ───────────────────────────────────────
+     REPLAY_BUFFER_PATH.parent.mkdir(parents=True, exist_ok=True)
+     replay_buffer = train_graphs[:500]  # Keep up to 500 training samples
+     torch.save(replay_buffer, REPLAY_BUFFER_PATH)
+     print(f"\n💾 Replay buffer saved: {len(replay_buffer)} samples → {REPLAY_BUFFER_PATH}")
+
+     print(f"\n✅ GNN weights saved to: {WEIGHTS_PATH}")
+     print("=" * 60)
+
+
+ if __name__ == "__main__":
+     main()
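
`merge_datasets` lives in data_collector.py, which is not shown in this section; a minimal sketch of the 70/15/15 stratified split it is described as performing, under that assumption (function name and seed are illustrative):

import random
from typing import List, Tuple

def stratified_split_70_15_15(
    phish_urls: List[str],
    legit_urls: List[str],
    seed: int = 42,
) -> Tuple[list, list, list]:
    """Split each class 70/15/15 separately so every split keeps the class ratio."""
    rng = random.Random(seed)
    train, val, test = [], [], []
    for urls, label in ((phish_urls, 1), (legit_urls, 0)):
        shuffled = urls[:]
        rng.shuffle(shuffled)
        n = len(shuffled)
        a, b = int(n * 0.70), int(n * 0.85)
        train += [(u, label) for u in shuffled[:a]]
        val += [(u, label) for u in shuffled[a:b]]
        test += [(u, label) for u in shuffled[b:]]
    for split in (train, val, test):
        rng.shuffle(split)
    return train, val, test
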
url_heuristics.py ADDED
@@ -0,0 +1,326 @@
+ # ============================================================
+ # PhishGuard AI - url_heuristics.py
+ # Tier 2: Heuristic Rule Engine — 15 independent signals
+ #
+ # Pure Python regex ONLY — ZERO I/O, ZERO ML, ZERO network
+ # All regex patterns precompiled in __init__ for < 2ms latency
+ # Max raw score: 130 (sum of the 15 signal weights) → normalized to 0-100
+ # Decision: score >= 80 → BLOCK | < 80 → pass to Tier 3
+ # ============================================================
+
+ from __future__ import annotations
+
+ import re
+ import math
+ import time
+ from dataclasses import dataclass, field
+ from typing import List, Tuple
+ from urllib.parse import urlparse
+
+
+ @dataclass
+ class HeuristicResult:
+     """Result from the Tier 2 heuristic scoring engine."""
+     score: int                                        # 0-100 normalized score
+     signals: List[str] = field(default_factory=list)  # human-readable triggered rules
+     raw_score: int = 0                                # pre-normalization total (max 130)
+
+
+ # Sum of all 15 signal weights: 25+20+15+15+10 + 8×5 + 3 + 2 = 130
+ MAX_RAW_SCORE: int = 130
+
+
+ class HeuristicScorer:
+     """
+     Tier 2 Heuristic Rule Engine.
+
+     Scores URLs 0-100 across 15 independent regex/math signals.
+     All regex patterns are precompiled in __init__ (called once at startup).
+     The score() method runs all 15 checks in < 2ms on standard hardware.
+     """
+
+     def __init__(self) -> None:
+         # ── Precompile ALL regex patterns (called once) ──────────────
+         self._re_ip_hostname = re.compile(
+             r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$"
+         )
+
+         self._suspicious_tlds: frozenset[str] = frozenset({
+             ".xyz", ".tk", ".ml", ".ga", ".cf",
+             ".gq", ".pw", ".top", ".click",
+         })
+
+         self._phishing_keywords: Tuple[str, ...] = (
+             "login", "verify", "secure", "update", "account",
+             "banking", "signin", "reset", "confirm", "suspend",
+             "webscr", "cmd", "payment", "alert",
+         )
+         self._re_phishing_keywords = re.compile(
+             r"(?:" + "|".join(re.escape(kw) for kw in self._phishing_keywords) + r")",
+             re.IGNORECASE,
+         )
+
+         self._brand_names: Tuple[str, ...] = (
+             "paypal", "google", "apple", "microsoft", "amazon",
+             "netflix", "facebook", "instagram", "chase",
+             "wellsfargo", "bankofamerica",
+         )
+         self._brand_legitimate_domains: dict[str, frozenset[str]] = {
+             brand: frozenset({
+                 f"{brand}.com", f"www.{brand}.com",
+                 f"{brand}.org", f"{brand}.net",
+             })
+             for brand in self._brand_names
+         }
+         self._re_brands = re.compile(
+             r"(?:" + "|".join(re.escape(b) for b in self._brand_names) + r")",
+             re.IGNORECASE,
+         )
+
+         self._re_non_standard_port = re.compile(r":(\d+)")
+         self._standard_ports: frozenset[int] = frozenset({80, 443, 8080})
+
+         self._re_double_slash = re.compile(r"(?<=.)//")
+
+         self._re_url_encoded = re.compile(r"%[0-9A-Fa-f]{2}")
+
+     # ── Public API ───────────────────────────────────────────────────
+
+     def score(self, url: str) -> HeuristicResult:
+         """
+         Score a raw URL string from 0-100 for phishing probability.
+         Runs all 15 checks. Returns HeuristicResult with signals.
+         """
+         raw_score: int = 0
+         signals: List[str] = []
+
+         # Parse URL once, reuse across all checks
+         try:
+             parsed = urlparse(url if "://" in url else f"http://{url}")
+         except Exception:
+             return HeuristicResult(score=0, signals=["parse_error"], raw_score=0)
+
+         hostname: str = (parsed.hostname or "").lower()
+         path: str = parsed.path or ""
+         query: str = parsed.query or ""
+         url_lower: str = url.lower()
+
+         # Extract domain (without subdomains) for brand check
+         host_parts = hostname.split(".")
+         domain = ".".join(host_parts[-2:]) if len(host_parts) >= 2 else hostname
+
+         # Run all 15 checks
+         checks: List[Tuple[int, str]] = [
+             self._check_ip_hostname(hostname),
+             self._check_suspicious_tld(hostname),
+             self._check_phishing_keywords(url_lower),
+             self._check_brand_spoofing(url_lower, domain),
+             self._check_subdomain_depth(hostname),
+             self._check_url_length(url),
+             self._check_domain_length(hostname),
+             self._check_hyphen_count(hostname),
+             self._check_digit_ratio(hostname),
+             self._check_shannon_entropy(hostname),
+             self._check_non_standard_port(parsed.netloc),
+             self._check_double_slash_redirect(path),
+             self._check_url_encoding(url),
+             self._check_query_length(query),
+             self._check_path_depth(path),
+         ]
+
+         for points, signal in checks:
+             if points > 0:
+                 raw_score += points
+                 signals.append(signal)
+
+         # Normalize: raw_score / 130 * 100, capped at 100
+         normalized = min(round(raw_score / MAX_RAW_SCORE * 100), 100)
+
+         return HeuristicResult(
+             score=normalized,
+             signals=signals,
+             raw_score=raw_score,
+         )
+
+     # ── 15 Individual Signal Checks ──────────────────────────────────
+
+     def _check_ip_hostname(self, hostname: str) -> Tuple[int, str]:
+         """Signal 1: IP address as hostname (25 points)."""
+         if self._re_ip_hostname.match(hostname):
+             return 25, "IP address as hostname"
+         return 0, ""
+
+     def _check_suspicious_tld(self, hostname: str) -> Tuple[int, str]:
+         """Signal 2: Suspicious/cheap TLD (20 points)."""
+         for tld in self._suspicious_tlds:
+             if hostname.endswith(tld):
+                 return 20, f"Suspicious TLD ({tld})"
+         return 0, ""
+
+     def _check_phishing_keywords(self, url_lower: str) -> Tuple[int, str]:
+         """Signal 3: Phishing keywords in URL (15 points)."""
+         matches = self._re_phishing_keywords.findall(url_lower)
+         if matches:
+             unique = set(m.lower() for m in matches)
+             return 15, f"Phishing keywords: {', '.join(sorted(unique))}"
+         return 0, ""
+
+     def _check_brand_spoofing(self, url_lower: str, domain: str) -> Tuple[int, str]:
+         """Signal 4: Brand name in URL but wrong domain (15 points)."""
+         matches = self._re_brands.findall(url_lower)
+         for brand_match in matches:
+             brand = brand_match.lower()
+             legit_domains = self._brand_legitimate_domains.get(brand, frozenset())
+             # Check if the domain IS the legitimate brand domain
+             if domain not in legit_domains and f"www.{domain}" not in legit_domains:
+                 return 15, f"Brand spoofing: '{brand}' on non-brand domain"
+         return 0, ""
+
+     def _check_subdomain_depth(self, hostname: str) -> Tuple[int, str]:
+         """Signal 5: Excessive subdomains >= 3 (10 points)."""
+         parts = hostname.split(".")
+         subdomain_count = max(0, len(parts) - 2)
+         if subdomain_count >= 3:
+             return 10, f"Excessive subdomains ({subdomain_count})"
+         return 0, ""
+
+     def _check_url_length(self, url: str) -> Tuple[int, str]:
+         """Signal 6: URL length > 100 chars (5 points)."""
+         if len(url) > 100:
+             return 5, f"Long URL ({len(url)} chars)"
+         return 0, ""
+
+     def _check_domain_length(self, hostname: str) -> Tuple[int, str]:
+         """Signal 7: Domain length > 30 chars (5 points)."""
+         if len(hostname) > 30:
+             return 5, f"Long domain ({len(hostname)} chars)"
+         return 0, ""
+
+     def _check_hyphen_count(self, hostname: str) -> Tuple[int, str]:
+         """Signal 8: Hyphen count >= 3 in domain (5 points)."""
+         count = hostname.count("-")
+         if count >= 3:
+             return 5, f"Excessive hyphens in domain ({count})"
+         return 0, ""
+
+     def _check_digit_ratio(self, hostname: str) -> Tuple[int, str]:
+         """Signal 9: Digit ratio in domain > 0.3 (5 points)."""
+         if not hostname:
+             return 0, ""
+         digits = sum(1 for c in hostname if c.isdigit())
+         ratio = digits / len(hostname)
+         if ratio > 0.3:
+             return 5, f"High digit ratio in domain ({ratio:.2f})"
+         return 0, ""
+
+     def _check_shannon_entropy(self, hostname: str) -> Tuple[int, str]:
+         """Signal 10: High Shannon entropy > 3.5 (5 points)."""
+         if not hostname:
+             return 0, ""
+         length = len(hostname)
+         freq: dict[str, int] = {}
+         for c in hostname:
+             freq[c] = freq.get(c, 0) + 1
+         entropy = -sum(
+             (count / length) * math.log2(count / length)
+             for count in freq.values()
+             if count > 0
+         )
+         if entropy > 3.5:
+             return 5, f"High entropy domain ({entropy:.2f})"
+         return 0, ""
+
+     def _check_non_standard_port(self, netloc: str) -> Tuple[int, str]:
+         """Signal 11: Non-standard port in URL (5 points)."""
+         match = self._re_non_standard_port.search(netloc)
+         if match:
+             port = int(match.group(1))
+             if port not in self._standard_ports:
+                 return 5, f"Non-standard port (:{port})"
+         return 0, ""
+
+     def _check_double_slash_redirect(self, path: str) -> Tuple[int, str]:
+         """Signal 12: Double slash redirect in path (5 points)."""
+         if self._re_double_slash.search(path):
+             return 5, "Double-slash redirect in path"
+         return 0, ""
+
+     def _check_url_encoding(self, url: str) -> Tuple[int, str]:
+         """Signal 13: URL-encoded characters > 5 (5 points)."""
+         encoded_chars = self._re_url_encoded.findall(url)
+         if len(encoded_chars) > 5:
+             return 5, f"Excessive URL encoding ({len(encoded_chars)} encoded chars)"
+         return 0, ""
+
+     def _check_query_length(self, query: str) -> Tuple[int, str]:
+         """Signal 14: Query string length > 200 (3 points)."""
+         if len(query) > 200:
+             return 3, f"Long query string ({len(query)} chars)"
+         return 0, ""
+
+     def _check_path_depth(self, path: str) -> Tuple[int, str]:
+         """Signal 15: Path depth > 6 levels (2 points)."""
+         segments = [s for s in path.split("/") if s]
+         if len(segments) > 6:
+             return 2, f"Deep path ({len(segments)} levels)"
+         return 0, ""
+
+
+ # ── Legacy compatibility wrapper ─────────────────────────────────────
+ _default_scorer = HeuristicScorer()
+
+
+ def analyze_url(url: str) -> dict:
+     """
+     Legacy-compatible wrapper around HeuristicScorer.
+     Returns dict with 'score', 'flags', 'is_suspicious' for backward compat.
+     """
+     result = _default_scorer.score(url)
+     return {
+         "score": result.score,
+         "flags": result.signals,
+         "is_suspicious": result.score >= 40,
+         "raw_score": result.raw_score,
+     }
+
+
+ # ── Benchmark (run directly to test latency) ─────────────────────────
+ if __name__ == "__main__":
+     test_urls = [
+         "https://www.google.com",
+         "http://192.168.1.1/admin/login",
+         "https://paypal-secure-login.xyz/verify/account?id=12345",
+         "https://a.b.c.d.evil.com/login/secure/update/verify",
+         "https://secure-login-bank-now.tk:9999/path/a/b/c/d/e/f/g.php",
+         "http://xn--pypal-4ve.com/signin?token=%2F%40%3A%20%2F%40%3A%20extra&" + "a" * 200,
+         "https://www.amazon.com/dp/B0something",
+         "https://microsft-l0gin-verfy.ga/account/suspended//evil.com/reset",
+     ]
+
+     scorer = HeuristicScorer()
+     print("=" * 70)
+     print("PhishGuard Tier 2 — Heuristic Benchmark")
+     print("=" * 70)
+
+     times: list[float] = []
+     for url in test_urls:
+         start = time.perf_counter()
+         result = scorer.score(url)
+         elapsed_us = (time.perf_counter() - start) * 1_000_000
+
+         times.append(elapsed_us)
+         action = "BLOCK" if result.score >= 80 else "→ Tier 3"
+         print(f"\n URL: {url[:80]}...")
+         print(f"   Score: {result.score}/100 (raw {result.raw_score}/{MAX_RAW_SCORE}) → {action}")
+         if result.signals:
+             for sig in result.signals:
+                 print(f"   ⚡ {sig}")
+         print(f"   Latency: {elapsed_us:.0f}µs")
+
+     print("\n" + "=" * 70)
+     print(f"   Avg latency: {sum(times)/len(times):.0f}µs")
+     print(f"   Max latency: {max(times):.0f}µs")
+     print("   Target: < 2000µs (2ms)")
+     print(f"   Result: {'✅ PASS' if max(times) < 2000 else '❌ FAIL'}")
+     print("=" * 70)
visual_analyzer.py ADDED
@@ -0,0 +1,512 @@
+ # ============================================================
+ # PhishGuard AI - visual_analyzer.py
+ # Takes a screenshot of a webpage using a headless browser
+ # and analyzes it for visual phishing indicators.
+ #
+ # Screenshot parameters (from architecture doc 2.3):
+ #   Viewport: 1280×800 (standard desktop resolution)
+ #   Timeout: 10s (prevent hanging on slow/malicious pages)
+ #   Wait: domcontentloaded (faster than networkidle)
+ #   Blocked: fonts, media, video (60-70% faster load)
+ #   User-Agent: Chrome 120 string (avoid bot detection)
+ #
+ # Tier 4 is OPTIONAL — controlled by env var ENABLE_VISUAL_TIER.
+ #   Set ENABLE_VISUAL_TIER=1 to enable.
+ #   Unset / set 0 → tier 4 is skipped with "tier4_disabled".
+ #
+ # Render.com: If deploying with Playwright, your render.yaml
+ # build command must install Chromium deps. See render.yaml
+ # comments and the Dockerfile for required apt packages.
+ #
+ # Latency budget: < 200ms for screenshot capture
+ # ============================================================
+
+ from __future__ import annotations
+
+ import os
+ import time
+ import hashlib
+ import logging
+ from urllib.parse import urlparse
+
+ logger = logging.getLogger("phishguard.visual")
+
+ # ── Environment gate ─────────────────────────────────────────────────────────
+ ENABLE_VISUAL_TIER = os.environ.get("ENABLE_VISUAL_TIER", "0").strip() in ("1", "true", "yes")
+
+ if not ENABLE_VISUAL_TIER:
+     print("[PhishGuard] Tier 4 visual analysis DISABLED (set ENABLE_VISUAL_TIER=1 to enable)")
+
+ # ── Playwright availability ──────────────────────────────────────────────────
+ PLAYWRIGHT_AVAILABLE = False
+ if ENABLE_VISUAL_TIER:
+     try:
+         from playwright.async_api import async_playwright
+         PLAYWRIGHT_AVAILABLE = True
+         print("[PhishGuard] Playwright available — screenshot capture enabled")
+     except ImportError:
+         print("[PhishGuard] Playwright not installed — visual analysis will use heuristic-only mode")
+
+ # ── PIL availability ─────────────────────────────────────────────────────────
+ _pil_available = False
+ try:
+     from PIL import Image
+     import io as _io
+     _pil_available = True
+ except ImportError:
+     print("[PhishGuard] Pillow not available — color analysis disabled")
+
+
+ # ── Screenshot cache config ──────────────────────────────────────────────────
+ _CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "screenshots")
+ _CACHE_TTL = 24 * 60 * 60  # 24 hours in seconds
+
+ os.makedirs(_CACHE_DIR, exist_ok=True)
+
+
+ # ── Brand / financial keyword databases ──────────────────────────────────────
+ BRAND_DATABASE = {
+     # brand_keyword → list of legitimate domains
+     "paypal": ["paypal.com"],
+     "apple": ["apple.com", "icloud.com"],
+     "google": ["google.com", "gmail.com", "accounts.google.com"],
+     "amazon": ["amazon.com", "amazon.co.uk", "aws.amazon.com"],
+     "microsoft": ["microsoft.com", "live.com", "outlook.com", "office.com"],
+     "netflix": ["netflix.com"],
+     "facebook": ["facebook.com", "fb.com"],
+     "instagram": ["instagram.com"],
+     "chase": ["chase.com"],
+     "wellsfargo": ["wellsfargo.com"],
+     "bankofamerica": ["bankofamerica.com"],
+     "citibank": ["citibank.com", "citi.com"],
+     "hsbc": ["hsbc.com"],
+     "hdfc": ["hdfcbank.com"],
+     "icici": ["icicibank.com"],
+     "sbi": ["onlinesbi.com", "sbi.co.in"],
+ }
+
+ FINANCIAL_BRANDS = {
+     "paypal", "chase", "wellsfargo", "bankofamerica", "citibank",
+     "hsbc", "hdfc", "icici", "sbi", "bank", "banking",
+ }
+
+
+ def _domain_hash(url: str) -> str:
+     """Generate a stable hash for screenshot caching based on the domain."""
+     try:
+         parsed = urlparse(url if url.startswith("http") else "http://" + url)
+         host = parsed.hostname or url
+         return hashlib.sha256(host.encode()).hexdigest()[:16]
+     except Exception:
+         return hashlib.sha256(url.encode()).hexdigest()[:16]
+
+
+ def _get_root_domain(url: str) -> str:
+     """Extract root domain from URL. E.g. https://login.paypal.com → paypal.com"""
+     try:
+         parsed = urlparse(url if url.startswith("http") else "http://" + url)
+         host = (parsed.hostname or "").lower().replace("www.", "")
+         parts = host.split(".")
+         return ".".join(parts[-2:]) if len(parts) >= 2 else host
+     except Exception:
+         return ""
+
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # SCREENSHOT CAPTURE (with cache)
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ def _get_cached_screenshot(url: str) -> bytes | None:
+     """
+     Check if a cached screenshot exists for this domain and is < 24 hours old.
+     Returns the screenshot bytes or None.
+     """
+     dhash = _domain_hash(url)
+     cache_path = os.path.join(_CACHE_DIR, f"{dhash}.png")
+
+     if not os.path.exists(cache_path):
+         return None
+
+     # Check age
+     age = time.time() - os.path.getmtime(cache_path)
+     if age >= _CACHE_TTL:
+         # Expired — delete stale cache
+         try:
+             os.remove(cache_path)
+         except OSError:
+             pass
+         return None
+
+     try:
+         with open(cache_path, "rb") as f:
+             data = f.read()
+         logger.info(f"Screenshot cache HIT | url={url} | age={age:.0f}s")
+         return data
+     except Exception:
+         return None
+
+
+ def _save_screenshot_cache(url: str, data: bytes):
+     """Save screenshot bytes to cache as screenshots/<domain_hash>.png."""
+     try:
+         dhash = _domain_hash(url)
+         cache_path = os.path.join(_CACHE_DIR, f"{dhash}.png")
+         with open(cache_path, "wb") as f:
+             f.write(data)
+         logger.info(f"Screenshot cached | url={url} | path={cache_path}")
+     except Exception as e:
+         logger.warning(f"Screenshot cache write failed | error={e}")
+
+
+ async def take_screenshot(url: str) -> bytes | None:
+     """
+     Open the URL in a hidden (headless) browser and take a screenshot.
+     The user never sees this browser window.
+
+     Uses a 24-hour cache: if screenshots/<domain_hash>.png exists and is
+     fresh, returns cached bytes without launching a browser.
+
+     Returns: screenshot as bytes, or None if it fails.
+     """
+     # Gate: tier 4 disabled
+     if not ENABLE_VISUAL_TIER:
+         return None
+
+     # Check cache first
+     cached = _get_cached_screenshot(url)
+     if cached is not None:
+         return cached
+
+     # Playwright not available — can't take a fresh screenshot
+     if not PLAYWRIGHT_AVAILABLE:
+         logger.warning(f"Screenshot skipped (no Playwright) | url={url}")
+         return None
+
+     try:
+         async with async_playwright() as p:
+             browser = await p.chromium.launch(headless=True)
+             context = await browser.new_context(
+                 viewport={"width": 1280, "height": 800},
+                 ignore_https_errors=True,
+                 user_agent=(
+                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                     "AppleWebKit/537.36 (KHTML, like Gecko) "
+                     "Chrome/120.0.0.0 Safari/537.36"
+                 )
+             )
+             page = await context.new_page()
+
+             # Block fonts and media to speed up loading (60-70% faster)
+             await page.route(
+                 "**/*.{woff,woff2,ttf,mp4,mp3,wav}",
+                 lambda route: route.abort()
+             )
+
+             await page.goto(url, timeout=10000, wait_until="domcontentloaded")
+
+             # (Page metadata extraction lives in take_screenshot_with_metadata.)
+             screenshot = await page.screenshot(full_page=False)
+             await browser.close()
+
+             # Cache the screenshot for 24 hours
+             if screenshot:
+                 _save_screenshot_cache(url, screenshot)
+
+             return screenshot
+
+     except Exception as e:
+         logger.error(f"Screenshot failed | url={url} | error={e}")
+         return None
+
+
+ async def take_screenshot_with_metadata(url: str) -> dict:
+     """
+     Enhanced screenshot capture that also extracts page metadata
+     (title, login forms) for heuristic visual scoring.
+
+     Returns: {
+         "screenshot": bytes|None,
+         "page_title": str,
+         "has_password_field": bool,
+         "uses_https": bool,
+         "error": str|None
+     }
+     """
+     result = {
+         "screenshot": None,
+         "page_title": "",
+         "has_password_field": False,
+         "uses_https": url.lower().startswith("https"),
+         "error": None,
+     }
+
+     # Gate: tier 4 disabled
+     if not ENABLE_VISUAL_TIER:
+         result["error"] = "tier4_disabled"
+         return result
+
+     # Check screenshot cache (metadata won't be cached, just the image)
+     cached = _get_cached_screenshot(url)
+     if cached is not None:
+         result["screenshot"] = cached
+         # We can't get page metadata from cache, but we have the image
+         return result
+
+     if not PLAYWRIGHT_AVAILABLE:
+         result["error"] = "playwright_not_available"
+         return result
+
+     try:
+         async with async_playwright() as p:
+             browser = await p.chromium.launch(headless=True)
+             context = await browser.new_context(
+                 viewport={"width": 1280, "height": 800},
+                 ignore_https_errors=True,
+                 user_agent=(
+                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                     "AppleWebKit/537.36 (KHTML, like Gecko) "
+                     "Chrome/120.0.0.0 Safari/537.36"
+                 )
+             )
+             page = await context.new_page()
+
+             await page.route(
+                 "**/*.{woff,woff2,ttf,mp4,mp3,wav}",
+                 lambda route: route.abort()
+             )
+
+             await page.goto(url, timeout=10000, wait_until="domcontentloaded")
+
+             # Extract metadata
+             result["page_title"] = await page.title() or ""
+             result["has_password_field"] = await page.locator("input[type='password']").count() > 0
+
+             screenshot = await page.screenshot(full_page=False)
+             await browser.close()
+
+             result["screenshot"] = screenshot
+
+             # Cache the screenshot
+             if screenshot:
+                 _save_screenshot_cache(url, screenshot)
+
+     except Exception as e:
+         result["error"] = str(e)
+         logger.error(f"Screenshot+metadata failed | url={url} | error={e}")
+
+     return result
+
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # VISUAL PHISHING HEURISTICS (no CNN needed)
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ def analyze_visual_heuristic(url: str, page_title: str = "",
+                              has_password_field: bool = False) -> dict:
+     """
+     Heuristic visual phishing scoring WITHOUT needing a trained CNN.
+     Returns heuristic_visual_score from 0.0 to 1.0 based on:
+
+     Signal 1: Page title contains brand names but domain doesn't match
+     Signal 2: Page has a login form (input[type=password])
+     Signal 3: SSL cert missing for pages mentioning financial brands
+     Signal 4: Brand keyword in URL path but not in domain (path spoofing)
+
+     Returns: {
+         heuristic_visual_score: float 0..1,
+         flags: list[str],
+         brand_mismatch: bool,
+         has_login_form: bool,
+         ssl_missing_financial: bool
+     }
+     """
+     score = 0.0
+     flags = []
+     brand_mismatch = False
+     ssl_missing_financial = False
+     root_domain = _get_root_domain(url)
+     url_lower = url.lower()
+     title_lower = (page_title or "").lower()
+     uses_https = url_lower.startswith("https")
+
+     # ── Signal 1: Brand name in page title but domain doesn't match ───────
+     for brand, legit_domains in BRAND_DATABASE.items():
+         if brand in title_lower:
+             if not any(d in root_domain for d in legit_domains):
+                 score += 0.30
+                 flags.append(f"title_brand_mismatch:{brand}")
+                 brand_mismatch = True
+                 break  # One brand mismatch is enough
+
+     # ── Signal 2: Login form detected (input[type=password]) ──────────────
+     if has_password_field:
+         score += 0.15
+         flags.append("has_password_field")
+         # Extra risk if combined with brand mismatch
+         if brand_mismatch:
+             score += 0.15
+             flags.append("login_form_with_brand_mismatch")
+
+     # ── Signal 3: No SSL for financial brand content ──────────────────────
+     mentions_financial = any(
+         fb in title_lower or fb in url_lower
+         for fb in FINANCIAL_BRANDS
+     )
+     if mentions_financial and not uses_https:
+         score += 0.25
+         flags.append("no_ssl_financial_content")
+         ssl_missing_financial = True
+
+     # ── Signal 4: Brand keyword in URL path but not in domain ─────────────
+     try:
+         parsed = urlparse(url)
+         path = (parsed.path or "").lower()
+         for brand, legit_domains in BRAND_DATABASE.items():
+             if brand in path and not any(d in root_domain for d in legit_domains):
+                 score += 0.15
+                 flags.append(f"brand_in_path_not_domain:{brand}")
+                 break
+     except Exception:
+         pass
+
+     return {
+         "heuristic_visual_score": round(min(score, 1.0), 4),
+         "flags": flags,
+         "brand_mismatch": brand_mismatch,
+         "has_login_form": has_password_field,
+         "ssl_missing_financial": ssl_missing_financial,
+     }
+
+
+ def analyze_visual_basic(screenshot_bytes: bytes, url: str) -> dict:
+     """
+     Basic visual analysis using color histograms.
+     Detects if a page uses colors associated with known brands
+     but the URL doesn't match that brand.
+
+     Note: For full CNN analysis, see cnn/cnn_model.py
+     """
+     if not screenshot_bytes:
+         return {"visual_risk": 0.1, "note": "screenshot_failed"}
+
+     if not _pil_available:
+         return {"visual_risk": 0.1, "note": "pil_not_available"}
+
+     try:
+         img = Image.open(_io.BytesIO(screenshot_bytes)).convert("RGB")
+         img_small = img.resize((224, 224))
+
+         # Get average color channels (split once, reuse the bands)
+         r_band, g_band, b_band = img_small.split()
+         r_vals = list(r_band.getdata())
+         g_vals = list(g_band.getdata())
+         b_vals = list(b_band.getdata())
+
+         r_avg = sum(r_vals) / len(r_vals)
+         g_avg = sum(g_vals) / len(g_vals)
+         b_avg = sum(b_vals) / len(b_vals)
+
+         risk = 0.2  # baseline
+         url_lower = url.lower()
+
+         # PayPal brand colors: deep blue
+         if b_avg > r_avg * 1.4 and b_avg > g_avg * 1.3:
+             if "paypal" not in url_lower:
+                 risk += 0.25
+
+         # Microsoft brand colors: orange/blue
+         if r_avg > 180 and b_avg < 100:
+             if "microsoft" not in url_lower and "office" not in url_lower:
+                 risk += 0.20
+
+         # Apple brand: mostly white/grey
+         if r_avg > 220 and g_avg > 220 and b_avg > 220:
+             if "apple" not in url_lower:
+                 risk += 0.10
+
+         return {
+             "visual_risk": round(min(risk, 1.0), 4),
+             "dominant_rgb": [round(r_avg), round(g_avg), round(b_avg)],
+             "note": "basic_color_analysis"
+         }
+
+     except Exception as e:
+         logger.warning(f"Color analysis failed | error={e}")
+         return {"visual_risk": 0.1, "note": "analysis_error"}
+
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # FULL TIER 4 ANALYSIS (combines CNN + heuristics + color)
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ async def run_tier4_analysis(url: str, page_title: str = "",
+                              page_snippet: str = "") -> dict:
+     """
+     Complete Tier 4 visual analysis pipeline.
+     Called by main.py for borderline cases (0.40 ≤ P₃ < 0.85).
+
+     Graceful fallback chain:
+       1. If ENABLE_VISUAL_TIER is off → tier4_disabled
+       2. If screenshot fails → screenshot_failed (with heuristic fallback)
+       3. If CNN fails → uses heuristic_visual_score only
+
+     Returns: {
+         tier4_score: float|None,
+         tier4_status: str ("ok"|"screenshot_failed"|"tier4_disabled"|...),
+         tier4_reason: str,
+         visual_heuristic: dict,
+         color_analysis: dict,
+         screenshot_cached: bool
+     }
+     """
+     # ── Gate: completely skip if not enabled ───────────────────────────────
+     if not ENABLE_VISUAL_TIER:
+         return {
+             "tier4_score": None,
+             "tier4_status": "tier4_disabled",
+             "tier4_reason": "ENABLE_VISUAL_TIER env var not set",
+         }
+
+     # ── Attempt screenshot with metadata extraction ───────────────────────
+     meta = await take_screenshot_with_metadata(url)
+     screenshot = meta["screenshot"]
+     extracted_title = meta["page_title"] or page_title
+     has_password = meta["has_password_field"]
+     screenshot_error = meta["error"]
+
+     # ── Always run visual heuristics (no screenshot needed) ───────────────
+     heuristic = analyze_visual_heuristic(
+         url,
+         page_title=extracted_title,
+         has_password_field=has_password,
+     )
+
+     # ── Screenshot failed → return heuristic-only result ──────────────────
+     if screenshot is None:
+         reason = screenshot_error or "unknown_screenshot_error"
+         return {
+             "tier4_score": heuristic["heuristic_visual_score"],
+             "tier4_status": "screenshot_failed",
+             "tier4_reason": reason,
+             "visual_heuristic": heuristic,
+             "color_analysis": None,
+             "screenshot_cached": False,
+         }
+
+     # ── Color-based analysis (works without trained CNN) ──────────────────
+     color = analyze_visual_basic(screenshot, url)
+
+     # ── Combine heuristic + color into a single tier4 score ───────────────
+     # Weight: 60% heuristic, 40% color (since CNN isn't trained)
+     combined = (heuristic["heuristic_visual_score"] * 0.60) + (color["visual_risk"] * 0.40)
+
+     return {
+         "tier4_score": round(min(combined, 1.0), 4),
+         "tier4_status": "ok",
+         "tier4_reason": "heuristic_and_color_analysis",
+         "visual_heuristic": heuristic,
+         "color_analysis": color,
+         "screenshot_cached": _get_cached_screenshot(url) is not None,
+     }
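
A minimal usage sketch for the Tier 4 entry point above, run from the backend directory with ENABLE_VISUAL_TIER=1 exported (the URL is illustrative):

import asyncio

from visual_analyzer import run_tier4_analysis

async def demo():
    verdict = await run_tier4_analysis("http://paypal-secure-login.xyz/verify")
    print(verdict["tier4_status"], verdict.get("tier4_score"))

asyncio.run(demo())
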
vocab.txt ADDED
The diff for this file is too large to render. See raw diff