elecie committed on
Commit 466b21e · 1 Parent(s): 05c0375
Files changed (1)
  1. app.py +34 -24
app.py CHANGED
@@ -1,17 +1,11 @@
- import gradio as gr
- from transformers import pipeline
  import re
  import tldextract
  from rapidfuzz import fuzz
+ import gradio as gr

- # Load lightweight zero-shot model
- classifier = pipeline("zero-shot-classification", model="joeddav/distilbert-base-uncased-go-emotions")
-
-
- # Define categories
+ # --- Labels & Regex ---
  LABELS = ["urgent", "fear", "authority", "financial scam", "safe"]

- # Regex backup cues
  CUES = {
      "urgency": [r"\burgent\b", r"\bimmediately\b", r"\bverify now\b", r"\blimited time\b"],
      "fear": [r"\bsuspended\b", r"\block(ed)?\b", r"\blegal action\b", r"\bunauthorized\b"],
@@ -23,9 +17,23 @@ TRUSTED_DOMAINS = ["google.com", "paypal.com", "microsoft.com", "amazon.com", "f
  SUSPICIOUS_TLDS = ["xyz", "top", "tk", "gq", "cf", "ml"]
  URL_PATTERN = re.compile(r"(https?://[^\s]+|www\.[^\s]+|\b[a-zA-Z0-9-]+\.[a-z]{2,}\b)")

-
+ # --- Lazy-load Hugging Face model ---
+ classifier = None
+ def get_classifier():
+     global classifier
+     if classifier is None:
+         from transformers import pipeline
+         classifier = pipeline(
+             "zero-shot-classification",
+             model="valhalla/distilbart-mnli-12-1",
+             device=-1  # CPU
+         )
+     return classifier
+
+ # --- Analysis functions ---
  def regex_analysis(text):
-     findings, score = [], 0
+     findings = []
+     score = 0
      for category, patterns in CUES.items():
          for pat in patterns:
              if re.search(pat, text, re.IGNORECASE):
@@ -33,9 +41,9 @@ def regex_analysis(text):
                  score += 20
      return score, findings

-
  def huggingface_analysis(text):
-     result = classifier(text, LABELS)
+     clf = get_classifier()
+     result = clf(text, LABELS)
      label_scores = list(zip(result["labels"], result["scores"]))
      label_scores.sort(key=lambda x: x[1], reverse=True)

@@ -45,9 +53,9 @@ def huggingface_analysis(text):

      return hf_score, findings

-
  def url_analysis(url):
-     findings, score = [], 0
+     findings = []
+     score = 0
      ext = tldextract.extract(url)
      domain = f"{ext.domain}.{ext.suffix}"

@@ -72,13 +80,11 @@ def url_analysis(url):

      return score, findings

-
  def extract_url_from_text(text):
      match = URL_PATTERN.search(text)
      return match.group(0) if match else None

-
- # Main analysis function (Gradio will call this)
+ # --- Main analyze function for Gradio ---
  def analyze(text):
      regex_score, regex_findings = regex_analysis(text)
      hf_score, hf_findings = huggingface_analysis(text)
@@ -105,18 +111,22 @@ def analyze(text):
      return {
          "Score": total_score,
          "Risk Level": risk_level,
-         "Reasons": reasons,
+         "Reasons": "\n".join(reasons),
          "Extracted URL": url if url else "None detected"
      }

-
- # Gradio UI
+ # --- Gradio Interface ---
  iface = gr.Interface(
      fn=analyze,
-     inputs=gr.Textbox(lines=4, placeholder="Paste email/text here..."),
-     outputs="json",
-     title="Phishing Detection",
-     description="Analyzes text and URLs for phishing risks."
+     inputs=gr.Textbox(lines=5, placeholder="Paste text here..."),
+     outputs=[
+         gr.Textbox(label="Score"),
+         gr.Textbox(label="Risk Level"),
+         gr.Textbox(label="Reasons"),
+         gr.Textbox(label="Extracted URL")
+     ],
+     title="Phishing / Scam Detector",
+     description="Analyzes text for urgency, fear, authority, and financial scam cues, plus suspicious URLs."
  )

  if __name__ == "__main__":
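
For context on the model-loading change: the eager pipeline(...) call at import time is replaced by a lazily constructed classifier, so transformers is only imported and the checkpoint only downloaded on the first call to get_classifier(), letting the Gradio app start quickly; the checkpoint also moves to valhalla/distilbart-mnli-12-1, an NLI-style model of the kind the zero-shot-classification pipeline is designed around. A minimal way to exercise the lazy loader outside the UI (not part of the commit; it assumes the file above is saved as app.py in the working directory):

# Not part of the commit: quick shell check of the lazy loader, assuming the
# file above is saved as app.py in the working directory.
from app import get_classifier, LABELS

clf = get_classifier()   # first call imports transformers and loads the model; later calls reuse it
result = clf("Your account has been suspended. Verify now to avoid legal action.", LABELS)

# The zero-shot pipeline returns candidate labels sorted by score, highest first.
for label, score in zip(result["labels"], result["scores"]):
    print(f"{label}: {score:.3f}")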
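
One thing the diff leaves open: the interface now declares four output Textbox components, while analyze() still returns a single dict. gr.Interface maps return values onto output components positionally, one value per component, so a dict return suited the old outputs="json" but the new component list expects four values. A self-contained sketch of that contract with a stand-in function (illustrative only, not code from this commit):

# Illustrative sketch, not part of the commit: with a list of output components,
# Gradio expects the function to return one value per component, in order.
import gradio as gr

def report(text):
    # stand-in for analyze(): four values for the four Textboxes below
    return "40", "Moderate", "urgency cue: urgent\nfear cue: suspended", "None detected"

demo = gr.Interface(
    fn=report,
    inputs=gr.Textbox(lines=5, placeholder="Paste text here..."),
    outputs=[
        gr.Textbox(label="Score"),
        gr.Textbox(label="Risk Level"),
        gr.Textbox(label="Reasons"),
        gr.Textbox(label="Extracted URL"),
    ],
)

if __name__ == "__main__":
    demo.launch()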