Spaces:
Build error
Build error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, re, time, pickle, zipfile, shutil, urllib.request
|
| 2 |
+
from urllib.parse import urlparse
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from typing import Optional, List
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import Levenshtein
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
|
| 12 |
+
from fastapi import FastAPI, HTTPException
|
| 13 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 14 |
+
from pydantic import BaseModel
|
| 15 |
+
from transformers import (
|
| 16 |
+
BertTokenizer, BertForSequenceClassification,
|
| 17 |
+
RobertaTokenizer, RobertaForSequenceClassification
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
# ── Setup ──────────────────────────────────────────────
|
| 21 |
+
# ASGI application object that the endpoint decorators in this file attach to.
app = FastAPI(title="AdaptiveShield API", version="1.0.0")

# CORS: every origin, method and header is accepted.
# NOTE(review): a wildcard origin combined with allow_credentials=True is
# discouraged by the FastAPI/Starlette CORS documentation; confirm whether
# any client actually sends credentials before tightening this.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 24 |
+
|
| 25 |
+
# All inference in this file runs on the CPU.
DEVICE = torch.device("cpu")
MAX_LEN = 128       # max_length handed to the BERT / RoBERTa tokenizers
MAX_URL_LEN = 200   # number of characters encoded for the character CNN
NUM_FEATURES = 30   # fallback GNN input width when the checkpoint has none

# Well-known domains used as the reference set for edit-distance checks.
TOP_DOMAINS = [
    "google.com", "youtube.com", "facebook.com", "amazon.com",
    "wikipedia.org", "twitter.com", "instagram.com", "linkedin.com",
    "microsoft.com", "apple.com", "netflix.com", "paypal.com",
    "ebay.com", "reddit.com", "github.com", "stackoverflow.com",
    "dropbox.com", "spotify.com", "adobe.com", "yahoo.com",
]

# Domain suffixes that the feature extractor treats as suspicious.
SUSPICIOUS_TLDS = [
    ".xyz", ".tk", ".ml", ".ga", ".cf", ".pw", ".top",
    ".ru", ".cn", ".info", ".biz", ".click", ".link",
]

# Substrings whose presence anywhere in a URL sets the "brand" flag.
BRAND_KEYWORDS = [
    "paypal", "amazon", "google", "microsoft", "apple", "facebook",
    "netflix", "bank", "secure", "login", "verify", "account",
    "update", "confirm", "password", "credit", "debit", "wallet",
]

# Character vocabulary for the CNN: real characters start at index 2,
# index 0 is padding and index 1 stands for any character not listed here.
URL_CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_~:/?#[]@!$&()*+,;=%"
char_to_idx = dict(zip(URL_CHARS, range(2, len(URL_CHARS) + 2)))
char_to_idx["<PAD>"] = 0
char_to_idx["<UNK>"] = 1
VOCAB_SIZE = len(char_to_idx)

# In-memory, per-process stores filled by the /feedback and /scan endpoints.
feedback_store = []
scan_history = []
|
| 51 |
+
|
| 52 |
+
# ── CNN Model ──────────────────────────────────────────
|
| 53 |
+
class PhishingCNN(nn.Module):
    """Character-level convolutional classifier over index-encoded URLs.

    An embedding layer feeds several parallel Conv1d -> BatchNorm -> ReLU
    branches (one per kernel width). Each branch is max-pooled over the
    sequence axis, the pooled vectors are concatenated, and a small MLP
    produces ``num_classes`` logits.

    Args:
        vocab_size: Number of embedding rows; falls back to the module-level
            ``VOCAB_SIZE`` when falsy.
        embed_dim: Embedding width.
        num_filters: Output channels of every convolution branch.
        filter_sizes: Kernel widths, one branch per entry.
        num_classes: Size of the output layer.
        dropout: Dropout probability of the first classifier layer; the
            second dropout uses ``dropout * 0.6``.
    """

    def __init__(self, vocab_size=None, embed_dim=128, num_filters=128,
                 filter_sizes=(2, 3, 4, 5), num_classes=2, dropout=0.5):
        super().__init__()
        # An immutable default avoids sharing one list between instances;
        # callers may still pass a list.
        filter_sizes = tuple(filter_sizes)
        vs = vocab_size or VOCAB_SIZE
        self.embedding = nn.Embedding(vs, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(embed_dim, num_filters, fs),
                nn.BatchNorm1d(num_filters),
                nn.ReLU(),
            )
            for fs in filter_sizes
        ])
        total = num_filters * len(filter_sizes)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout), nn.Linear(total, 256), nn.ReLU(),
            nn.BatchNorm1d(256), nn.Dropout(dropout * 0.6),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for index tensor ``x``.

        ``x`` is a (batch, length) integer tensor; ``length`` must be at
        least the largest kernel width.
        """
        # Conv1d expects (batch, channels, length).
        emb = self.embedding(x).permute(0, 2, 1)
        pooled = []
        for conv in self.convs:
            # Run each branch exactly once: calling it twice doubles the
            # cost and, in training mode, updates BatchNorm statistics twice.
            feature_map = conv(emb)
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))
        return self.classifier(torch.cat(pooled, dim=1))
|
| 73 |
+
|
| 74 |
+
# ── GNN Model ──────────────────────────────────────────
|
| 75 |
+
# The GNN is optional: if torch_geometric cannot be imported (or the class
# definition fails for any reason) the flag stays False and the reason is printed.
GNN_AVAILABLE = False
try:
    from torch_geometric.nn import SAGEConv, BatchNorm as GNNBatchNorm

    class PhishingGNN(nn.Module):
        """Three GraphSAGE layers followed by a two-layer classification head."""

        def __init__(self, num_features, hidden_dim, num_classes, dropout=0.3):
            super().__init__()
            wide = hidden_dim * 2
            # Attribute names are kept as-is so saved state dicts still load.
            self.conv1 = SAGEConv(num_features, hidden_dim)
            self.conv2 = SAGEConv(hidden_dim, wide)
            self.conv3 = SAGEConv(wide, hidden_dim)
            self.bn1 = GNNBatchNorm(hidden_dim)
            self.bn2 = GNNBatchNorm(wide)
            self.bn3 = GNNBatchNorm(hidden_dim)
            self.cls = nn.Sequential(
                nn.Linear(hidden_dim, 64),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(64, num_classes),
            )
            self.drop = dropout

        def forward(self, x, ei):
            """Apply conv -> batch-norm -> ReLU -> dropout three times, then classify."""
            stages = (
                (self.conv1, self.bn1),
                (self.conv2, self.bn2),
                (self.conv3, self.bn3),
            )
            for conv, norm in stages:
                activated = F.relu(norm(conv(x, ei)))
                x = F.dropout(activated, p=self.drop, training=self.training)
            return self.cls(x)

    GNN_AVAILABLE = True
except Exception as e:
    print(f"GNN not available: {e}")
|
| 100 |
+
|
| 101 |
+
# ── Feature Functions ──────────────────────────────────
|
| 102 |
+
def compute_entropy(text):
    """Return the Shannon entropy, in bits, of the characters in ``text``.

    An empty (falsy) input yields 0.0. A tiny epsilon is added inside the
    logarithm, exactly as the callers' trained features expect.
    """
    if not text:
        return 0.0
    length = len(text)
    accumulated = 0
    for symbol in set(text):
        share = text.count(symbol) / length
        accumulated += share * np.log2(share + 1e-10)
    return -accumulated
|
| 106 |
+
|
| 107 |
+
def min_typo_distance(domain):
    """Return the smallest Levenshtein distance from ``domain`` to any TOP_DOMAINS entry.

    Host names are case-insensitive, so the comparison is done in lower
    case. Only a *leading* ``www.`` is removed: stripping the substring
    anywhere (as ``str.replace`` does) would turn e.g. ``awww.com`` into
    ``acom`` and distort the distance.

    Returns 10 when ``domain`` is empty.
    """
    if not domain:
        return 10
    clean = domain.lower()
    if clean.startswith("www."):
        clean = clean[len("www."):]
    return min(Levenshtein.distance(clean, d) for d in TOP_DOMAINS)
|
| 111 |
+
|
| 112 |
+
def is_ip(domain):
    """Tell whether ``domain`` is written as four dot-separated groups of 1-3 digits.

    The groups are not range-checked, so ``999.1.1.1`` also counts.
    """
    dotted_quad = re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", domain)
    return dotted_quad is not None
|
| 114 |
+
|
| 115 |
+
def count_encoded(url):
    """Count percent-encoded bytes (``%`` followed by two hex digits) in ``url``."""
    return sum(1 for _ in re.finditer(r"%[0-9a-fA-F]{2}", url))
|
| 117 |
+
|
| 118 |
+
def extract_domain_name(url):
    """Return the registrable-looking part of the host in ``url``.

    For ordinary host names this is the last two dot-separated labels
    (``www.paypal.com`` -> ``paypal.com``). Fixes over the naive version:

    * IPv4 hosts are returned whole; cutting them to two octets made
      downstream IP detection impossible.
    * A scheme is only assumed present when the text starts with
      ``<scheme>://``; ``startswith("http")`` mis-parsed hosts such as
      ``httpbin.org``.
    * ``hostname`` is used instead of ``netloc`` so user-info and port do
      not leak into the result (it is also lower-cased by urllib).

    If the URL cannot be parsed, the input is returned unchanged.
    """
    try:
        has_scheme = re.match(r"^[A-Za-z][A-Za-z0-9+.\-]*://", url)
        parsed = urlparse(url if has_scheme else "http://" + url)
        host = parsed.hostname or ""
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed netloc; the others: non-string input.
        return url
    if re.fullmatch(r"\d{1,3}(?:\.\d{1,3}){3}", host):
        return host
    parts = host.split(".")
    return ".".join(parts[-2:]) if len(parts) >= 2 else host
|
| 125 |
+
|
| 126 |
+
def extract_features(url):
    """Build the 30-element float32 feature vector for ``url``.

    The order and definition of every feature is unchanged, because the
    scaler and GNN consume this vector positionally. Only the error
    handling (bare ``except`` -> ``ValueError``, the one exception
    ``urlparse`` raises for a ``str``) and repeated sub-computations were
    tidied.

    NOTE(review): the scheme test is ``startswith("http")`` and ``domain``
    is the raw ``netloc`` (it may contain a port or user-info). Both are
    kept on purpose so the features match what this code produced before;
    confirm against the training pipeline before changing them.
    """
    url = str(url)
    try:
        parsed = urlparse(url if url.startswith("http") else "http://" + url)
        domain, path, query = parsed.netloc, parsed.path, parsed.query
    except ValueError:
        domain, path, query = url, "", ""

    td = min_typo_distance(domain)
    safe_len = max(len(url), 1)            # avoids division by zero
    digit_count = sum(c.isdigit() for c in url)
    labels = domain.split(".")
    lowered = url.lower()

    return np.array([
        # lengths
        len(url), len(domain), len(path), len(query),
        # punctuation counts
        url.count("."), url.count("-"), url.count("/"),
        url.count("@"), url.count("?"), url.count("="),
        url.count("%"), digit_count,
        # number of dots in the host
        len(labels) - 1 if domain else 0,
        1 if url.startswith("https") else 0,
        1 if is_ip(domain) else 0,
        1 if any(domain.endswith(t) for t in SUSPICIOUS_TLDS) else 0,
        1 if any(b in lowered for b in BRAND_KEYWORDS) else 0,
        compute_entropy(url),
        digit_count / safe_len,
        # non-empty path segments
        len([p for p in path.split("/") if p]),
        # edit distance to the nearest well-known domain
        1 if td == 1 else 0, 1 if td == 2 else 0, td,
        len(re.findall(r"[0-9]", domain)),
        1 if "xn--" in domain else 0,
        url.count("_"), count_encoded(url),
        # dash-separated IP-like pattern inside the host
        1 if re.search(r"\d{1,3}-\d{1,3}-\d{1,3}-\d{1,3}", domain) else 0,
        # length of the last host label
        len(labels[-1]) if domain else 0,
        sum(c.isupper() for c in url) / safe_len,
    ], dtype=np.float32)
|
| 155 |
+
|
| 156 |
+
def get_risk_level(prob):
    """Map a probability to "HIGH" (>= 0.70), "MEDIUM" (>= 0.40) or "LOW"."""
    for threshold, level in ((0.70, "HIGH"), (0.40, "MEDIUM")):
        if prob >= threshold:
            return level
    return "LOW"
|
| 160 |
+
|
| 161 |
+
def analyze_extra(url):
    """Return the rule-based indicators reported alongside the model scores.

    Changes from the earlier version:

    * ``typosquatting_detected`` requires an edit distance of 1 or 2. A
      distance of 0 means the domain *is* one of TOP_DOMAINS and must not
      be reported as typosquatting.
    * ``uses_https`` checks for the actual ``https://`` scheme prefix
      (case-insensitively) instead of any text starting with "https".
    * The closest legitimate domain is found in a single pass.

    The returned dictionary keeps the same keys in the same order.
    """
    domain = extract_domain_name(url)
    td = min_typo_distance(domain)
    bare = domain.replace("www.", "")
    # min() keeps the first minimum, i.e. the earliest TOP_DOMAINS entry on ties.
    closest = min(TOP_DOMAINS, key=lambda d: Levenshtein.distance(bare, d))
    lowered = url.lower()
    return {
        "typosquatting_detected": 1 <= td <= 2,
        "typo_distance": int(td),
        "closest_legitimate": closest,
        "homograph_detected": "xn--" in domain,
        "ip_as_domain": is_ip(domain),
        "suspicious_tld": any(domain.endswith(t) for t in SUSPICIOUS_TLDS),
        "brand_impersonation": any(b in lowered for b in BRAND_KEYWORDS),
        "url_entropy": round(compute_entropy(url), 4),
        "uses_https": lowered.startswith("https://"),
        "url_encoded_chars": count_encoded(url),
        "domain": domain,
    }
|
| 179 |
+
|
| 180 |
+
# ── Model Setup ────────────────────────────────────────
|
| 181 |
+
models = {}
|
| 182 |
+
|
| 183 |
+
def download_from_drive(file_id, dest_path):
    """Download a Google Drive file to ``dest_path`` unless it already exists.

    The payload is written to ``<dest_path>.part`` and renamed only after
    the transfer finishes. Without this, an interrupted download left a
    truncated file behind that the "already exists" shortcut would accept
    on the next start.

    Returns:
        True if the file is present afterwards, False if the download failed.
    """
    if os.path.exists(dest_path):
        print(f"Already exists: {dest_path}")
        return True
    url = f"https://drive.google.com/uc?export=download&id={file_id}&confirm=t"
    print(f"Downloading to {dest_path}...")
    partial_path = f"{dest_path}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, dest_path)
        print(f"Downloaded: {dest_path}")
        return True
    except Exception as e:
        print(f"Failed: {e}")
        # Best-effort cleanup of whatever was written before the failure.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False
|
| 196 |
+
|
| 197 |
+
def extract_transformer(zip_path, target_path):
    """Unpack a zipped transformer model into ``target_path``.

    The archive is searched for the first directory that contains both
    ``config.json`` and ``model.safetensors``; that directory replaces
    ``target_path``. Nothing happens if ``target_path/config.json`` already
    exists.

    Improvements: the scratch directory is unique and always removed, an
    unreadable archive is reported instead of raising during start-up, and
    a message is printed when no model directory is found.

    Returns:
        None in every case.
    """
    import tempfile

    if os.path.exists(f"{target_path}/config.json"):
        print(f"Already extracted: {target_path}")
        return
    name_hint = os.path.basename(target_path)
    with tempfile.TemporaryDirectory(prefix=f"ext_{name_hint}_") as tmp:
        try:
            with zipfile.ZipFile(zip_path, "r") as z:
                z.extractall(tmp)
        except (zipfile.BadZipFile, OSError) as e:
            print(f"Could not extract {zip_path}: {e}")
            return
        for root, _dirs, files in os.walk(tmp):
            if "config.json" in files and "model.safetensors" in files:
                if os.path.exists(target_path):
                    shutil.rmtree(target_path)
                shutil.copytree(root, target_path)
                print(f"Extracted: {target_path}")
                return
    print(f"No config.json + model.safetensors directory found in {zip_path}")
|
| 211 |
+
|
| 212 |
+
def extract_pt(zip_path, pt_path):
    """Copy the checkpoint named like ``pt_path`` out of a zip archive.

    The archive is searched recursively for a file whose name equals the
    basename of ``pt_path``; the first match is copied to ``pt_path``.
    Nothing happens if ``pt_path`` already exists.

    Improvements: the scratch directory is unique and always removed, an
    unreadable archive is reported instead of raising during start-up, and
    a message is printed when the file is not found.

    Returns:
        None in every case.
    """
    import tempfile

    if os.path.exists(pt_path):
        print(f"Already extracted: {pt_path}")
        return
    pt_name = os.path.basename(pt_path)
    with tempfile.TemporaryDirectory(prefix=f"ext_{pt_name}_") as tmp:
        try:
            with zipfile.ZipFile(zip_path, "r") as z:
                z.extractall(tmp)
        except (zipfile.BadZipFile, OSError) as e:
            print(f"Could not extract {zip_path}: {e}")
            return
        for root, _dirs, files in os.walk(tmp):
            if pt_name in files:
                shutil.copy(os.path.join(root, pt_name), pt_path)
                print(f"Extracted: {pt_path}")
                return
    print(f"{pt_name} not found in {zip_path}")
|
| 226 |
+
|
| 227 |
+
def setup_models():
    """Fetch and unpack model archives under ``./models``.

    An archive is downloaded only when its ``*_FILE_ID`` environment
    variable is non-empty; any archive that is present afterwards is
    unpacked to its expected location.
    """
    for folder in ("./models/bert", "./models/roberta"):
        os.makedirs(folder, exist_ok=True)

    # archive file name -> environment variable holding its Drive id
    sources = (
        ("bert_model.zip", "BERT_FILE_ID"),
        ("roberta_model.zip", "ROBERTA_FILE_ID"),
        ("cnn_model.zip", "CNN_FILE_ID"),
        ("gnn_model.zip", "GNN_FILE_ID"),
    )
    pending = [(archive, os.getenv(variable, "")) for archive, variable in sources]
    for archive, drive_id in pending:
        if drive_id:
            download_from_drive(drive_id, f"./models/{archive}")

    # (archive path, destination, extractor name) in the order they are unpacked
    unpack_plan = (
        ("./models/bert_model.zip", "./models/bert", "transformer"),
        ("./models/roberta_model.zip", "./models/roberta", "transformer"),
        ("./models/cnn_model.zip", "./models/cnn_best.pt", "pt"),
        ("./models/gnn_model.zip", "./models/gnn_best.pt", "pt"),
    )
    for archive_path, destination, flavour in unpack_plan:
        if not os.path.exists(archive_path):
            continue
        if flavour == "transformer":
            extract_transformer(archive_path, destination)
        else:
            extract_pt(archive_path, destination)

    print("Model setup complete.")
|
| 252 |
+
|
| 253 |
+
setup_models()
|
| 254 |
+
|
| 255 |
+
# ── Load Models ────────────────────────────────────────
|
| 256 |
+
# Each model is loaded independently and best-effort: a failure is printed
# and the corresponding key is simply absent from ``models``; the
# prediction helpers fall back to a neutral score in that case.
#
# SECURITY NOTE: ``torch.load(..., weights_only=False)`` and ``pickle.load``
# execute arbitrary code embedded in the file. The artefacts loaded here are
# downloaded at start-up, so they must come from a source you fully trust.
print(f"Loading models on {DEVICE}...")

try:
    models["bert_tokenizer"] = BertTokenizer.from_pretrained("./models/bert")
    models["bert"] = BertForSequenceClassification.from_pretrained("./models/bert").to(DEVICE).eval()
    print("BERT loaded.")
except Exception as e:
    print(f"BERT failed: {e}")

try:
    models["roberta_tokenizer"] = RobertaTokenizer.from_pretrained("./models/roberta")
    models["roberta"] = RobertaForSequenceClassification.from_pretrained("./models/roberta").to(DEVICE).eval()
    print("RoBERTa loaded.")
except Exception as e:
    print(f"RoBERTa failed: {e}")

try:
    ckpt = torch.load("./models/cnn_best.pt", map_location=DEVICE, weights_only=False)
    cnn = PhishingCNN(vocab_size=ckpt.get("vocab_size", VOCAB_SIZE))
    cnn.load_state_dict(ckpt["model_state"])
    models["cnn"] = cnn.to(DEVICE).eval()
    # Prefer the vocabulary stored with the checkpoint over the module default.
    models["char_to_idx"] = ckpt.get("char_to_idx", char_to_idx)
    print("CNN loaded.")
except Exception as e:
    print(f"CNN failed: {e}")

try:
    if GNN_AVAILABLE:
        ckpt = torch.load("./models/gnn_best.pt", map_location=DEVICE, weights_only=False)
        gnn = PhishingGNN(ckpt.get("num_features", NUM_FEATURES),
                          ckpt.get("hidden_dim", 128),
                          ckpt.get("num_classes", 2),
                          ckpt.get("dropout", 0.3))
        gnn.load_state_dict(ckpt["model_state"])
        models["gnn"] = gnn.to(DEVICE).eval()
        models["scaler"] = ckpt["scaler"]
        print("GNN loaded.")
except Exception as e:
    print(f"GNN failed: {e}")

# Fallback scaler: only consulted when the GNN checkpoint did not supply one.
if "scaler" not in models:
    try:
        with open("./models/scaler.pkl", "rb") as f:
            models["scaler"] = pickle.load(f)
    except FileNotFoundError:
        pass  # the standalone scaler file is optional
    except Exception as e:
        # Previously swallowed silently; a corrupt file is worth reporting.
        print(f"Scaler failed: {e}")

try:
    with open("./models/fusion_model.pkl", "rb") as f:
        models["fusion"] = pickle.load(f)
    print("Fusion loaded.")
except Exception as e:
    print(f"Fusion failed: {e}")

# Report real models only; tokenizers and the char vocabulary are helpers.
loaded = [k for k in models if not k.endswith("tokenizer") and not k.endswith("_to_idx")]
print(f"Models ready: {loaded}")
|
| 306 |
+
|
| 307 |
+
# ── Prediction Functions ───────────────────────────────
|
| 308 |
+
def pb(url):
    """Return the BERT softmax probability of class index 1 for ``url``.

    Class 1 is the one ``scan_url`` treats as phishing. Returns the neutral
    value 0.5 when BERT is not loaded or inference fails; failures are now
    printed instead of being swallowed by a bare ``except``.
    """
    if "bert" not in models:
        return 0.5
    try:
        enc = models["bert_tokenizer"](
            url, add_special_tokens=True, max_length=MAX_LEN,
            padding="max_length", truncation=True, return_tensors="pt",
        )
        with torch.no_grad():
            logits = models["bert"](
                input_ids=enc["input_ids"].to(DEVICE),
                attention_mask=enc["attention_mask"].to(DEVICE),
            ).logits
        return torch.softmax(logits, dim=1)[0][1].item()
    except Exception as e:
        print(f"BERT prediction failed: {e}")
        return 0.5
|
| 319 |
+
|
| 320 |
+
def pr(url):
    """Return the RoBERTa softmax probability of class index 1 for ``url``.

    Class 1 is the one ``scan_url`` treats as phishing. Returns the neutral
    value 0.5 when RoBERTa is not loaded or inference fails; failures are
    now printed instead of being swallowed by a bare ``except``.
    """
    if "roberta" not in models:
        return 0.5
    try:
        enc = models["roberta_tokenizer"](
            url, add_special_tokens=True, max_length=MAX_LEN,
            padding="max_length", truncation=True, return_tensors="pt",
        )
        with torch.no_grad():
            logits = models["roberta"](
                input_ids=enc["input_ids"].to(DEVICE),
                attention_mask=enc["attention_mask"].to(DEVICE),
            ).logits
        return torch.softmax(logits, dim=1)[0][1].item()
    except Exception as e:
        print(f"RoBERTa prediction failed: {e}")
        return 0.5
|
| 331 |
+
|
| 332 |
+
def pc(url):
    """Return the character-CNN softmax probability of class index 1 for ``url``.

    The URL is truncated to MAX_URL_LEN characters, mapped to indices
    (unknown characters -> 1) and right-padded with 0. Returns the neutral
    value 0.5 when the CNN is not loaded or inference fails; failures are
    now printed instead of being swallowed by a bare ``except``.
    """
    if "cnn" not in models:
        return 0.5
    try:
        cidx = models.get("char_to_idx", char_to_idx)
        enc = [cidx.get(c, 1) for c in str(url)[:MAX_URL_LEN]]
        enc += [0] * (MAX_URL_LEN - len(enc))
        batch = torch.tensor([enc], dtype=torch.long).to(DEVICE)
        with torch.no_grad():
            return torch.softmax(models["cnn"](batch), dim=1)[0][1].item()
    except Exception as e:
        print(f"CNN prediction failed: {e}")
        return 0.5
|
| 343 |
+
|
| 344 |
+
def pg(url):
    """Return the GNN softmax probability of class index 1 for ``url``.

    The URL becomes a single graph node (scaled hand-crafted features) with
    one self-loop edge. Returns the neutral value 0.5 when the GNN or the
    scaler is missing or inference fails; failures are now printed instead
    of being swallowed by a bare ``except``.
    """
    if "gnn" not in models or "scaler" not in models:
        return 0.5
    try:
        f = models["scaler"].transform(extract_features(url).reshape(1, -1))
        x = torch.tensor(f, dtype=torch.float).to(DEVICE)
        # One node connected only to itself.
        ei = torch.tensor([[0], [0]], dtype=torch.long).to(DEVICE)
        with torch.no_grad():
            return torch.softmax(models["gnn"](x, ei), dim=1)[0][1].item()
    except Exception as e:
        print(f"GNN prediction failed: {e}")
        return 0.5
|
| 353 |
+
|
| 354 |
+
def pf(b, r, c, g):
    """Fuse the four model probabilities into one phishing probability.

    Uses the loaded fusion model's ``predict_proba`` (class index 1) when
    available; otherwise, or if that call fails, returns the plain mean.
    Failures are now printed instead of being swallowed by a bare ``except``.
    """
    scores = [b, r, c, g]
    if "fusion" in models:
        try:
            return float(models["fusion"].predict_proba(np.array([scores]))[0][1])
        except Exception as e:
            print(f"Fusion prediction failed: {e}")
    return float(np.mean(scores))
|
| 358 |
+
|
| 359 |
+
# ── Request Models ─────────────────────────────────────
|
| 360 |
+
# Request body of POST /scan.
class ScanRequest(BaseModel):
    # The URL to classify; the endpoint strips surrounding whitespace.
    url: str
|
| 362 |
+
|
| 363 |
+
# Request body of POST /feedback.
class FeedbackRequest(BaseModel):
    # URL the feedback refers to.
    url: str
    # The submitter's verdict for that URL.
    is_phishing: bool
    # Optional free-text remark; defaults to an empty string.
    user_comment: Optional[str] = ""
|
| 367 |
+
|
| 368 |
+
# Request body of POST /scan/bulk.
class BulkScanRequest(BaseModel):
    # URLs to scan; the endpoint rejects lists longer than 50.
    urls: List[str]
|
| 370 |
+
|
| 371 |
+
# ── Endpoints ──────────────────────────────────────────
|
| 372 |
+
@app.get("/")
def root():
    """Return a banner with the loaded model names and the compute device."""
    helper_suffixes = ("tokenizer", "_to_idx")
    # Tokenizers and the character vocabulary are helpers, not models.
    loaded = [name for name in models if not name.endswith(helper_suffixes)]
    return {
        "message": "AdaptiveShield API",
        "status": "running",
        "models": loaded,
        "device": str(DEVICE),
    }
|
| 377 |
+
|
| 378 |
+
@app.get("/health")
def health():
    """Return a liveness payload with loaded model names and the current time."""
    helper_suffixes = ("tokenizer", "_to_idx")
    # Tokenizers and the character vocabulary are helpers, not models.
    loaded = [name for name in models if not name.endswith(helper_suffixes)]
    return {
        "status": "healthy",
        "models_loaded": loaded,
        "timestamp": datetime.now().isoformat(),
    }
|
| 383 |
+
|
| 384 |
+
@app.post("/scan")
def scan_url(request: ScanRequest):
    """Score one URL with every model, apply rule-based boosts and record the result.

    Raises:
        HTTPException: 400 when the URL is empty after stripping whitespace.
    """
    url = request.url.strip()
    if not url:
        raise HTTPException(status_code=400, detail="URL cannot be empty.")

    start = time.time()
    b = pb(url)
    r = pr(url)
    c = pc(url)
    g = pg(url)
    fp = pf(b, r, c, g)
    extra = analyze_extra(url)

    # (condition, amount added to the fused probability), applied in order
    boost_rules = (
        (extra["typosquatting_detected"] and extra["typo_distance"] == 1, 0.10),
        (extra["ip_as_domain"], 0.15),
        (extra["homograph_detected"], 0.10),
        (extra["suspicious_tld"] and extra["brand_impersonation"], 0.08),
    )
    boost = 0.0
    for triggered, amount in boost_rules:
        if triggered:
            boost += amount
    final = min(1.0, fp + boost)

    def pct(value):
        # Probabilities are reported as percentages with two decimals.
        return round(value * 100, 2)

    result = {
        "url": url,
        "label": "PHISHING" if final >= 0.5 else "LEGITIMATE",
        "phishing_probability": pct(final),
        "risk_level": get_risk_level(final),
        "model_scores": {
            "bert": pct(b), "roberta": pct(r),
            "cnn": pct(c), "gnn": pct(g),
            "fusion": pct(fp), "final": pct(final),
        },
        "extra_analysis": extra,
        "scan_time_ms": round((time.time() - start) * 1000, 2),
        "timestamp": datetime.now().isoformat(),
    }
    scan_history.append(result)
    return result
|
| 414 |
+
|
| 415 |
+
@app.post("/scan/bulk")
def scan_bulk(request: BulkScanRequest):
    """Scan up to 50 URLs and summarise the outcome.

    A URL whose scan raises is reported as ``{"url", "error"}`` in
    ``results``. Such entries used to be counted in ``legitimate_found``
    (computed as total minus phishing); they are now excluded from it and
    surfaced through the additional ``errors`` counter.

    Raises:
        HTTPException: 400 when more than 50 URLs are submitted.
    """
    if len(request.urls) > 50:
        raise HTTPException(status_code=400, detail="Max 50 URLs.")
    results = []
    ph = 0
    failed = 0
    for url in request.urls:
        try:
            res = scan_url(ScanRequest(url=url))
        except Exception as e:
            failed += 1
            results.append({"url": url, "error": str(e)})
            continue
        results.append(res)
        if res.get("label") == "PHISHING":
            ph += 1
    return {
        "total_scanned": len(results),
        "phishing_found": ph,
        "legitimate_found": len(results) - ph - failed,
        "errors": failed,
        "results": results,
    }
|
| 429 |
+
|
| 430 |
+
@app.post("/feedback")
def feedback(request: FeedbackRequest):
    """Append the submitted feedback to the in-memory store and acknowledge it."""
    entry = {
        "url": request.url,
        "is_phishing": request.is_phishing,
        "comment": request.user_comment,
        "timestamp": datetime.now().isoformat(),
    }
    feedback_store.append(entry)
    return {
        "message": "Feedback received.",
        "total_feedback": len(feedback_store),
    }
|
| 436 |
+
|
| 437 |
+
@app.get("/history")
def history(limit: int = 20):
    """Return the ``limit`` most recent scan results plus the total count.

    A non-positive ``limit`` yields an empty list. Previously ``limit=0``
    returned the *entire* history (``lst[-0:]`` is the whole list) and a
    negative value skipped entries from the front.
    """
    recent = scan_history[-limit:] if limit > 0 else []
    return {"total_scans": len(scan_history), "results": recent}
|
| 440 |
+
|
| 441 |
+
@app.get("/stats")
def stats():
    """Return aggregate counts and the mean scan time over the recorded scans."""
    total = len(scan_history)
    if total == 0:
        return {"message": "No scans yet."}
    ph = len([entry for entry in scan_history if entry.get("label") == "PHISHING"])
    durations = [entry.get("scan_time_ms", 0) for entry in scan_history]
    return {
        "total_scans": total,
        "phishing_detected": ph,
        "legitimate_detected": total - ph,
        "phishing_rate_percent": round(ph / total * 100, 2),
        "average_scan_time_ms": round(np.mean(durations), 2),
    }
|
| 450 |
+
```
|
| 451 |
+
|
| 452 |
+
Click **Commit changes to main**.
|
| 453 |
+
|
| 454 |
+
---
|
| 455 |
+
|
| 456 |
+
## STEP 5 : Create requirements.txt
|
| 457 |
+
|
| 458 |
+
Click **Add file**. Click **Create new file**. Name it **requirements.txt**. Paste this.
|
| 459 |
+
```
|
| 460 |
+
fastapi==0.111.0
|
| 461 |
+
uvicorn==0.30.1
|
| 462 |
+
torch==2.1.0
|
| 463 |
+
transformers==4.44.0
|
| 464 |
+
tokenizers==0.19.1
|
| 465 |
+
torch_geometric
|
| 466 |
+
scikit-learn>=1.3.0
|
| 467 |
+
numpy>=1.24.0
|
| 468 |
+
python-Levenshtein==0.25.1
|
| 469 |
+
pydantic>=2.0.0
|
| 470 |
+
python-multipart==0.0.9
|