Spaces:
Sleeping
Sleeping
import os, re | |
import gradio as gr | |
# Keep Transformers quiet & CPU-only friendly | |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") | |
# -------- Config -------- | |
URL_MODEL_ID = "CrabInHoney/urlbert-tiny-v4-malicious-url-classifier" | |
URL_LABEL_MAP = { | |
"LABEL_0": "benign", | |
"LABEL_1": "defacement", | |
"LABEL_2": "malware", | |
"LABEL_3": "phishing", | |
} | |
URL_RE = re.compile(r"""(?xi)\b(?:https?://|www\.)[a-z0-9\-._~%]+(?:/[^\s<>"']*)?""") | |
_pipe = None # created on first analyze() | |
def _extract_urls(t: str): | |
return sorted(set(m.group(0) for m in URL_RE.finditer(t or ""))) | |
def _pretty(raw, id2label): | |
if id2label: | |
if raw in id2label: | |
return id2label[raw] | |
k = raw.replace("LABEL_", "") | |
if k in id2label: | |
return id2label[k] | |
return URL_LABEL_MAP.get(raw, raw) | |
def analyze(text: str) -> str: | |
text = (text or "").strip() | |
if not text: | |
return "Paste an email body or a URL." | |
# Use single-URL mode if it looks like one; else extract from email text | |
urls = [text] if (text.lower().startswith(("http://","https://","www.")) and " " not in text) else _extract_urls(text) | |
if not urls: | |
return "No URLs detected in the text." | |
# Lazy import + pipeline creation keeps startup instant | |
global _pipe | |
if _pipe is None: | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline | |
tok = AutoTokenizer.from_pretrained(URL_MODEL_ID) | |
mdl = AutoModelForSequenceClassification.from_pretrained(URL_MODEL_ID) | |
_pipe = pipeline("text-classification", model=mdl, tokenizer=tok, device=-1, top_k=None) | |
id2label = getattr(_pipe.model.config, "id2label", None) | |
lines = [] | |
unsafe = False | |
for u in urls: | |
scores = sorted(_pipe(u)[0], key=lambda s: s["score"], reverse=True) | |
top = scores[0] | |
lbl = _pretty(top["label"], id2label) | |
conf = 100 * float(top["score"]) | |
lines.append(f"- **{u}** → **{lbl}** ({conf:.2f}%)") | |
if lbl.lower() in {"phishing", "malware", "defacement"}: | |
unsafe = True | |
verdict = "🔴 **UNSAFE (links flagged)**" if unsafe else "🟢 **SAFE (all links benign)**" | |
return verdict + "\n\n" + "\n".join(lines) | |
demo = gr.Interface( | |
fn=analyze, | |
inputs=gr.Textbox(lines=6, label="Email or URL", placeholder="Paste a URL or a full email…"), | |
outputs=gr.Markdown(label="Result"), | |
title="🛡️ Phishing Detector (via Link Analysis)", | |
description="We extract links and classify each with a compact malicious-URL model (CPU-only, free tier).", | |
) | |
if __name__ == "__main__": | |
demo.launch() | |