GitHub Action
update model
9a6c63e
|
raw
history blame
15.3 kB
metadata
license: mit
library_name: sklearn
tags:
  - classification
  - phishing

Model description

Training Procedure

Hyperparameters

Click to expand
Hyperparameter Value
memory
steps [('standardscaler', StandardScaler()), ('calibratedclassifiercv', CalibratedClassifierCV(cv=5, estimator=RandomForestClassifier(),
method='isotonic'))]
verbose False
standardscaler StandardScaler()
calibratedclassifiercv CalibratedClassifierCV(cv=5, estimator=RandomForestClassifier(),
method='isotonic')
standardscaler__copy True
standardscaler__with_mean True
standardscaler__with_std True
calibratedclassifiercv__base_estimator deprecated
calibratedclassifiercv__cv 5
calibratedclassifiercv__ensemble True
calibratedclassifiercv__estimator__bootstrap True
calibratedclassifiercv__estimator__ccp_alpha 0.0
calibratedclassifiercv__estimator__class_weight
calibratedclassifiercv__estimator__criterion gini
calibratedclassifiercv__estimator__max_depth
calibratedclassifiercv__estimator__max_features sqrt
calibratedclassifiercv__estimator__max_leaf_nodes
calibratedclassifiercv__estimator__max_samples
calibratedclassifiercv__estimator__min_impurity_decrease 0.0
calibratedclassifiercv__estimator__min_samples_leaf 1
calibratedclassifiercv__estimator__min_samples_split 2
calibratedclassifiercv__estimator__min_weight_fraction_leaf 0.0
calibratedclassifiercv__estimator__n_estimators 100
calibratedclassifiercv__estimator__n_jobs
calibratedclassifiercv__estimator__oob_score False
calibratedclassifiercv__estimator__random_state
calibratedclassifiercv__estimator__verbose 0
calibratedclassifiercv__estimator__warm_start False
calibratedclassifiercv__estimator RandomForestClassifier()
calibratedclassifiercv__method isotonic
calibratedclassifiercv__n_jobs

Model Plot

This is the architecture of the model loaded by joblib.

Pipeline(steps=[('standardscaler', StandardScaler()),('calibratedclassifiercv',CalibratedClassifierCV(cv=5,estimator=RandomForestClassifier(),method='isotonic'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Evaluation Results

Metric Value
accuracy 0.945652
f1-score 0.945114
precision 0.951996
recall 0.938331

Test Report

Test Report

Model Interpretation

Feature Importances

Test Report

How to Get Started with the Model

Below are some code snippets that load the model and run predictions with it.

With joblib (not recommended)

import joblib
import pandas as pd

# Sample records: each holds the URL plus the numeric features passed to the model.
urls = [
    {
        "url": "https://www.rga.com/about/workplace",
        "nb_hyperlinks": 97.0,
        "ratio_intHyperlinks": 0.969072165,
        "ratio_extHyperlinks": 0.030927835,
        "ratio_extRedirection": 0.0,
        "safe_anchor": 25.0,
        "domain_registration_length": 3571.0,
        "domain_age": 11039,
        "web_traffic": 178542.0,
        "google_index": 0.0,
        "page_rank": 5,
    },
]

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a source you trust.
model = joblib.load("models/model.pkl")

# Moving "url" into the index leaves only the feature columns as values.
features = pd.DataFrame(urls).set_index("url")

probas = model.predict_proba(features.to_numpy())

for record, proba in zip(urls, probas):
    # Index 1 of each probability row is reported as the phishing likelihood.
    phishing_pct = proba[1] * 100
    print(f"URL: {record['url']}")
    print(f"Likelihood of being a phishing site: {phishing_pct:.2f}%")
    print("----")


# output:
# URL: https://www.rga.com/about/workplace
# Likelihood of being a phishing site: 0.89%
# ----
  

With ONNX (recommended)

Python

import numpy as np
import onnxruntime
import pandas as pd

# Sample records: each holds the URL plus the numeric features passed to the model.
urls = [
    {
        "url": "https://www.rga.com/about/workplace",
        "nb_hyperlinks": 97.0,
        "ratio_intHyperlinks": 0.969072165,
        "ratio_extHyperlinks": 0.030927835,
        "ratio_extRedirection": 0.0,
        "safe_anchor": 25.0,
        "domain_registration_length": 3571.0,
        "domain_age": 11039,
        "web_traffic": 178542.0,
        "google_index": 0.0,
        "page_rank": 5,
    },
]

# Open an ONNX Runtime session on the exported model, restricted to the CPU provider.
sess = onnxruntime.InferenceSession(
    "models/model.onnx",
    providers=["CPUExecutionProvider"],
)

# Moving "url" into the index leaves only the feature columns as values.
df = pd.DataFrame(urls)
df = df.set_index("url")

# Cast the feature values to a float32 NumPy array before feeding the session.
inputs = df.astype(np.float32).to_numpy()

# Passing None requests every model output; the second output is used as the
# per-record class probabilities.
probas = sess.run(None, {"X": inputs})[1]

# Report one result per input record (the printed lines match the sample
# output below; no extra debug output is emitted).
for url, proba in zip(urls, probas):
    # Entry 1 of each probability record is reported as the phishing likelihood.
    print(f"URL: {url['url']}")
    print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
    print("----")

# output:
# URL: https://www.rga.com/about/workplace
# Likelihood of being a phishing site: 0.89%
# ----
  

JavaScript

const ort = require('onnxruntime-node');

// Feature columns, in the order the example records list them. The input
// tensor is built from this list so the result does not depend on the key
// order (or on extra keys) of the objects in `urls`.
// NOTE(review): assumed to be the column order the model was trained with -- confirm.
const FEATURE_NAMES = [
    "nb_hyperlinks",
    "ratio_intHyperlinks",
    "ratio_extHyperlinks",
    "ratio_extRedirection",
    "safe_anchor",
    "domain_registration_length",
    "domain_age",
    "web_traffic",
    "google_index",
    "page_rank",
];

// Sample records: each holds the URL plus the numeric features passed to the model.
const urls = [
    {
        "url": "http://rapidpaws.com/wp-content/we_transfer/index2.php?email=/",
        "nb_hyperlinks": 1,
        "ratio_intHyperlinks": 1,
        "ratio_extHyperlinks": 0,
        "ratio_extRedirection": 0,
        "safe_anchor": 0,
        "domain_registration_length": 338,
        "domain_age": 0,
        "web_traffic": 1853,
        "google_index": 1,
        "page_rank": 2,
    },
];

/**
 * Loads the ONNX model, runs it on every record in `urls`, and prints the
 * phishing likelihood for each one. Any failure is caught and logged.
 */
async function main() {
    try {
        // Creating an ONNX inference session with the specified model
        const modelPath = "./models/model.onnx";
        const session = await ort.InferenceSession.create(modelPath);

        // One row of feature values per record; the "url" key is left out
        const rows = urls.map(record => FEATURE_NAMES.map(name => record[name]));

        // The tensor takes flat data plus its shape: [records, features]
        const tensor = new ort.Tensor(
            'float32',
            Float32Array.from(rows.flat()),
            [rows.length, FEATURE_NAMES.length],
        );

        // Executing the inference session with the input tensor
        const results = await session.run({"X": tensor});

        // Flat probability buffer read from the "probabilities" output
        const probas = results['probabilities'].data;

        // Displaying results for each URL
        urls.forEach((record, index) => {
            // Two values are stored per record; index * 2 + 1 picks the one
            // associated with the phishing class
            const proba = probas[index * 2 + 1];
            const percent = (proba * 100).toFixed(2);

            console.log(`URL: ${record.url}`);
            console.log(`Likelihood of being a phishing site: ${percent}%`);
            console.log("----");
        });
    } catch (e) {
        console.log(`failed to inference ONNX model: ${e}.`);
    }
}

main();

// output (the percentage printed depends on the model version):
// URL: http://rapidpaws.com/wp-content/we_transfer/index2.php?email=/
// Likelihood of being a phishing site: <percent>%
// ----