|
|
|
|
|
|
|
|
|
|
|
import joblib
|
|
|
import re
|
|
|
from urllib.parse import urlparse
|
|
|
import tldextract
|
|
|
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the serialized model once, at import time. predict() below calls
# model.predict and model.predict_proba on this object.
# The path is relative, so it is resolved against the process's current
# working directory.
# NOTE(review): joblib.load unpickles the file and can therefore execute
# arbitrary code; only load model artifacts from a trusted source.
model = joblib.load("test_model.joblib")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_url_features(url):
    """Compute a small set of lexical features for a URL string.

    Args:
        url: The URL to inspect.

    Returns:
        A dict with the keys ``url_length``, ``num_dots``, ``has_ip``,
        ``https`` and ``domain_length``.
    """
    url_parts = urlparse(url)
    tld_parts = tldextract.extract(url)

    features = {}
    features["url_length"] = len(url)
    features["num_dots"] = url.count(".")
    # Four dot-separated digit runs anywhere in the URL (not only in the
    # host part) are treated as an IP address.
    features["has_ip"] = re.search(r"\d+\.\d+\.\d+\.\d+", url) is not None
    features["https"] = url_parts.scheme == "https"
    features["domain_length"] = len(tld_parts.domain)
    return features
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_pdf_text(pdf_path, *, max_chars=500):
    """Return the leading text of a PDF, truncated to ``max_chars`` characters.

    Pages are read in order and their extracted text is concatenated with no
    separator. Extraction stops as soon as at least ``max_chars`` characters
    have been collected, so later pages are not processed when they cannot
    contribute to the result.

    Args:
        pdf_path: Path (or file-like object) handed to ``PdfReader``.
        max_chars: Maximum number of characters to return. Defaults to 500.

    Returns:
        The first ``max_chars`` characters of the concatenated page text;
        an empty string if no page yields any text.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars < 0:
        raise ValueError("max_chars must be non-negative")

    reader = PdfReader(pdf_path)

    chunks = []
    collected = 0
    for page in reader.pages:
        if collected >= max_chars:
            # Enough text for the requested prefix; skip the remaining pages.
            break
        # extract_text() may return None for a page; treat that as empty.
        page_text = page.extract_text() or ""
        chunks.append(page_text)
        collected += len(page_text)

    # The last chunk may overshoot the limit, so trim after joining.
    return "".join(chunks)[:max_chars]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def predict(data):
    """Classify the supplied text (plus optional PDF text) with the loaded model.

    Expects JSON-style input:

        {"inputs": {"text": "...", "url": "...", "pdf_path": "..."}}

    All three keys under ``"inputs"`` are optional; a missing, empty or null
    value is treated as an empty string.

    * ``text`` is passed to the model.
    * ``url``, when non-empty, is run through ``extract_url_features``; the
      features are returned in the response but are not fed to the model.
    * ``pdf_path``, when non-empty, is read with ``extract_pdf_text`` and the
      extracted text is appended to ``text`` before prediction.

    Args:
        data: Mapping with an ``"inputs"`` mapping as described above.

    Returns:
        A dict with ``prediction`` (int), ``probability`` (float),
        ``url_features`` (dict, empty when no URL was given) and
        ``pdf_text_sample`` (first 100 characters of the extracted PDF text).

    Raises:
        KeyError: If ``data`` has no ``"inputs"`` key.
    """
    inputs = data["inputs"]

    # `or ""` also covers an explicit JSON null, which would otherwise make
    # the string concatenation below raise TypeError.
    text = inputs.get("text") or ""
    url = inputs.get("url") or ""
    pdf_path = inputs.get("pdf_path") or ""

    url_features = extract_url_features(url) if url else {}

    # NOTE(review): pdf_path is opened exactly as given and a sample of the
    # extracted text is echoed back in the response. If `data` can come from
    # untrusted callers, restrict the path to an allowed directory first.
    pdf_text = extract_pdf_text(pdf_path) if pdf_path else ""

    # Same input format as before: text and PDF text joined by one space.
    combined_text = text + " " + pdf_text

    pred = model.predict([combined_text])[0]
    # Column 1 of predict_proba is reported as the probability.
    # NOTE(review): assumes that column is the positive class - confirm
    # against the trained model's class order.
    prob = model.predict_proba([combined_text])[0][1]

    return {
        "prediction": int(pred),
        "probability": float(prob),
        "url_features": url_features,
        "pdf_text_sample": pdf_text[:100],
    }
|
|
|
|