"""Gradio demo that scores website URLs with a pre-trained LightGBM model.

The script installs its runtime dependencies, downloads the model file from
the Hugging Face Hub, defines the ``check`` prediction callback and builds
the web UI around it.
"""
import os

# Runtime dependencies are installed before the imports below so that
# packages such as lightgbm and pandas are available when they are imported.
os.system("pip install -q torch transformers huggingface_hub dnspython python-whois bs4 requests pandas pyOpenSSL lightgbm")

from gradio import Markdown, Textbox, Slider, Button, Blocks, Row, Column, JSON
import numpy as np
import lightgbm as lgb
import pandas as pd
from themes import aistrova
from huggingface_hub import hf_hub_download
from feature_extraction import existing_isssuers, extract_features

# Fetch the trained booster into the working directory and load it once at
# start-up; every call to ``check`` reuses this module-level model.
hf_hub_download(repo_id="FredZhang7/malphish-eater-v1", filename='phishing_model_combined_0.984_train.txt', local_dir="./")
model = lgb.Booster(model_file="phishing_model_combined_0.984_train.txt")


def check(website_urls: str, confidence_threshold: float) -> tuple[dict, dict]:
    """Classify newline-separated URLs as malicious or benign.

    Args:
        website_urls: URLs separated by newlines. Surrounding whitespace is
            stripped and blank lines are ignored.
        confidence_threshold: A URL is reported as malicious only when its
            model score is strictly greater than this value.

    Returns:
        A ``(features, predictions)`` pair.

        * ``features`` maps each URL to a ``{column: float}`` dict of the
          values fed to the model.
        * ``predictions`` is ``{"Malicious": {url: score}, "Benign":
          {url: 1 - score}}``.

        When no URL is supplied both groups are empty. When feature
        extraction yields no rows, the result is
        ``({}, {"Could not be reached": [urls]})``.
    """
    # str.split never returns an empty list, so blank entries have to be
    # filtered explicitly before checking whether anything was submitted.
    urls = [line.strip() for line in (website_urls or "").split("\n")]
    urls = [url for url in urls if url]
    if not urls:
        return {}, {"Malicious": {}, "Benign": {}}

    df = extract_features(urls)
    if len(df) == 0:
        return {}, {"Could not be reached": urls}

    df["issuer"] = df["issuer"].astype("category")
    df["page_rank_decimal"] = df["page_rank_decimal"].astype(float)
    df = pd.get_dummies(df, columns=['issuer'])

    # Make sure every issuer indicator column exists, even when the current
    # batch contains none of the corresponding issuers.
    if 'issuer_other' not in df.columns:
        df['issuer_other'] = 0
    for issuer in existing_isssuers:
        column = f"issuer_{issuer}"
        if column not in df.columns:
            df[column] = 0

    # NOTE(review): the frame is passed to the booster as-is; confirm that
    # this column order matches the order used when the model was trained.
    predictions = model.predict(df)

    malicious_urls = {}
    benign_urls = {}
    df_dict = {}
    # NOTE(review): this pairs the i-th URL with the i-th feature row, which
    # assumes extract_features returns one row per URL in input order.
    for i, url in enumerate(urls):
        score = float(predictions[i])
        if score > confidence_threshold:
            malicious_urls[url] = score
        else:
            benign_urls[url] = float(1 - predictions[i])
        # Positional lookup keeps the features aligned with predictions[i]
        # regardless of how the frame is indexed.
        row = df.iloc[i]
        df_dict[url] = {col: float(row[col]) for col in df.columns}

    return df_dict, {"Malicious": malicious_urls, "Benign": benign_urls}


# NOTE(review): the nesting of the layout below was reconstructed from a
# flattened source; confirm it matches the intended page structure.
with Blocks(theme=aistrova) as app:
    # Header: title and description on the left, theme toggles on the right.
    with Row():
        with Column(scale=12):
            Markdown("\n\nMalicious Websites Classification v1.0\n\n")
            Markdown(
                "✅ A demo for an attempt at detecting malicious websites (only works well for URLs in the dataset) [[Open-source Model (apache 2.0)](https://huggingface.co/FredZhang7/malphish-eater-v1)] [[Dataset & Analysis (apache 2.0)](https://huggingface.co/datasets/FredZhang7/malicious-website-features-2.4M)]\n\n"
                "✅ Trained on **2,430,000+** URLs and **37,350,000+** features; Achieved **98.4%+ accuracy** on training and validation sets.\n"
            )
        with Column(scale=0.75):
            Markdown(" ")
            with Row():
                toggle_light = Button(value="🌤️ Light Mode", variant="secondary")
            with Row():
                toggle_dark = Button(value="🌒 Dark Mode", variant="secondary")

            # The toggles run purely in the browser: they add or remove the
            # 'dark' class on <body> and call no Python function.
            toggle_light.click(
                None,
                _js="""() => { document.body.classList.remove('dark'); }""",
            )
            toggle_dark.click(
                None,
                _js="""() => { document.body.classList.add('dark'); }""",
            )

    # Main area: inputs on the left, JSON results on the right.
    with Row().style(equal_height=True):
        with Column():
            website_urls = Textbox(label='Website URLs', info='Enter URL(s), separated by newlines, to check if any of them is malicious.', lines=6, max_lines=100)
            confidence_threshold = Slider(label='Confidence Level', info='🎯 Only classify a URL as malicious with a confidence above this threshold', value=0.50, minimum=0.50, maximum=1.0, step=0.01)
            with Row():
                # A button's caption is its ``value``.
                clear_button = Button(value='Clear')
                check_button = Button(value='Submit', variant='primary')
        with Column():
            json = JSON(label='Extracted Features')
            predictions = JSON(label='Predictions')

    check_button.click(check, inputs=[website_urls, confidence_threshold], outputs=[json, predictions], api_name='predict')
    # Reset the textbox and both JSON panels.
    clear_button.click(lambda: ["", None, None], outputs=[website_urls, json, predictions], api_name=False)

app.queue(concurrency_count=1, max_size=10)
app.launch()