Spaces:

ai-assist-sh
/

PhishingMail-Forensics

Sleeping

App Files Files Community

ai-assist-sh committed on Aug 19

Commit

f9467b7

verified ·

1 Parent(s): 755ffe2

Update main.py

Browse files

Files changed (1) hide show

main.py +34 -63

main.py CHANGED Viewed

@@ -1,8 +1,9 @@
-import os, re, time, json, tempfile
 import gradio as gr
 import torch
 import torch.nn.functional as F
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 URL_MODEL_ID = "CrabInHoney/urlbert-tiny-v4-malicious-url-classifier"
@@ -70,19 +71,20 @@ def _forensic_block(url, token_ids, tokens, scores_sorted, cls_vec, elapsed_s, t
     md.append("```txt\n" + cls_preview + "\n```")
     return "\n".join(md)
-def analyze(text: str, forensic: bool, forensics_json: str):
     """
-    Returns:
-      - Markdown body
-      - Updated forensics_json (string)
     """
     text = (text or "").strip()
     if not text:
-        return "Paste an email body or a URL.", ""
     urls = [text] if (text.lower().startswith(("http://","https://","www.")) and " " not in text) else _extract_urls(text)
     if not urls:
-        return "No URLs detected in the text.", ""
     tok, mdl = _load_model()
     id2label_raw = getattr(mdl.config, "id2label", None) or {}
@@ -119,9 +121,9 @@ def analyze(text: str, forensic: bool, forensics_json: str):
             out = mdl(**enc, output_hidden_states=True)
         elapsed = time.time() - t0
-        logits = out.logits.squeeze(0)
-        probs  = _softmax(logits)
-        hidden_states = out.hidden_states
         cls_vec = hidden_states[-1][0, 0, :].cpu().tolist()
         per_class = [
@@ -141,7 +143,7 @@ def analyze(text: str, forensic: bool, forensics_json: str):
             "truncated": truncated,
             "logits": [float(x) for x in logits.cpu().tolist()],
             "probs":  [float(p) for p in probs],
-            "scores_sorted": per_class_sorted,
             "cls_vector": cls_vec,
             "cls_dim": len(cls_vec),
             "elapsed_sec": elapsed,
@@ -162,61 +164,30 @@ def analyze(text: str, forensic: bool, forensics_json: str):
     verdict = "🔴 **UNSAFE (links flagged)**" if unsafe else "🟢 **SAFE (all links benign)**"
     body = verdict + "\n\n" + _markdown_table(rows)
     if forensic and forensic_blocks:
         body += "\n\n---\n\n" + "\n\n---\n\n".join(forensic_blocks)
-    # Return JSON string (not dict) to avoid schema bug
-    return body, json.dumps(export_data, ensure_ascii=False)
-def export_forensics(forensics_json: str):
-    """Write the JSON string to a file and return the path."""
-    if not forensics_json:
-        return None
-    try:
-        data = json.loads(forensics_json)
-        if not isinstance(data, dict) or not data.get("items"):
-            return None
-    except Exception:
-        return None
-    fd, path = tempfile.mkstemp(prefix="forensics_", suffix=".json")
-    with os.fdopen(fd, "w", encoding="utf-8") as f:
-        f.write(forensics_json)
-    return path
-with gr.Blocks() as demo:
-    gr.Markdown("# 🛡️ PhishingMail — Forensics (Tokens, Logits, CLS)")
-    gr.Markdown(
-        "Paste an **email body** or a **URL**. We extract links and classify each with a compact malicious-URL model. "
-        "Enable **Forensic mode** to show tokens, logits, and the **[CLS] embedding**. "
-        "Use **Export** to download full forensics as JSON."
-    )
-    with gr.Row():
-        inp = gr.Textbox(lines=6, label="Email or URL", placeholder="Paste a URL or a full email…")
-    forensic_chk = gr.Checkbox(label="Forensic mode (tokens, logits, [CLS])", value=False)
-    # Hidden storage for forensics JSON (string)
-    forensics_json_store = gr.Textbox(value="", visible=False)
-    with gr.Row():
-        btn_analyze = gr.Button("Analyze", variant="primary")
-        btn_export  = gr.Button("Export forensics (JSON)")
-    out_md = gr.Markdown(label="Results")
-    out_file = gr.File(label="Download forensics JSON", interactive=False)
-    btn_analyze.click(
-        analyze,
-        inputs=[inp, forensic_chk, forensics_json_store],
-        outputs=[out_md, forensics_json_store],
-        show_progress=True,
-    )
-    btn_export.click(
-        export_forensics,
-        inputs=[forensics_json_store],
-        outputs=[out_file],
-    )
 if __name__ == "__main__":
-    # Extra-safe config for HF Spaces
     demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)

+import os, re, time, json
 import gradio as gr
 import torch
 import torch.nn.functional as F
+# Be quiet + CPU friendly
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 URL_MODEL_ID = "CrabInHoney/urlbert-tiny-v4-malicious-url-classifier"
     md.append("```txt\n" + cls_preview + "\n```")
     return "\n".join(md)
+def analyze(text: str, forensic: bool, show_json: bool):
     """
+    Returns a single Markdown block:
+      - verdict + compact table
+      - optional forensic blocks (tokens, logits, CLS)
+      - optional raw JSON (copy/paste)
     """
     text = (text or "").strip()
     if not text:
+        return "Paste an email body or a URL."
     urls = [text] if (text.lower().startswith(("http://","https://","www.")) and " " not in text) else _extract_urls(text)
     if not urls:
+        return "No URLs detected in the text."
     tok, mdl = _load_model()
     id2label_raw = getattr(mdl.config, "id2label", None) or {}
             out = mdl(**enc, output_hidden_states=True)
         elapsed = time.time() - t0
+        logits = out.logits.squeeze(0)             # (num_labels,)
+        probs = _softmax(logits)                    # list[float]
+        hidden_states = out.hidden_states           # tuple of layers
         cls_vec = hidden_states[-1][0, 0, :].cpu().tolist()
         per_class = [
             "truncated": truncated,
             "logits": [float(x) for x in logits.cpu().tolist()],
             "probs":  [float(p) for p in probs],
+            "scores_sorted": per_class_sorted,  # label+prob+logit
             "cls_vector": cls_vec,
             "cls_dim": len(cls_vec),
             "elapsed_sec": elapsed,
     verdict = "🔴 **UNSAFE (links flagged)**" if unsafe else "🟢 **SAFE (all links benign)**"
     body = verdict + "\n\n" + _markdown_table(rows)
     if forensic and forensic_blocks:
         body += "\n\n---\n\n" + "\n\n---\n\n".join(forensic_blocks)
+    if show_json:
+        # raw JSON for copy-paste (no File component needed)
+        pretty = json.dumps(export_data, ensure_ascii=False, indent=2)
+        body += "\n\n---\n\n**Raw forensics JSON (copy & save):**\n"
+        body += "```json\n" + pretty + "\n```"
+    return body
+demo = gr.Interface(
+    fn=analyze,
+    inputs=[
+        gr.Textbox(lines=6, label="Email or URL", placeholder="Paste a URL or a full email…"),
+        gr.Checkbox(label="Forensic mode (tokens, logits, [CLS])", value=True),
+        gr.Checkbox(label="Show raw JSON at the end (copy/paste)", value=False),
+    ],
+    outputs=gr.Markdown(label="Results"),
+    title="🛡️ PhishingMail — Forensics (HF Free CPU)",
+    description="Extract links, classify with a tiny URL model, and (optionally) view tokens, logits, and [CLS] embedding.",
+)
 if __name__ == "__main__":
+    # Safe defaults for HF Spaces (no share=True needed)
     demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)