|
--- |
|
license: mit |
|
library_name: sklearn |
|
tags: |
|
- tabular-classification |
|
- sklearn |
|
- phishing |
|
- onnx |
|
model_format: pickle |
|
model_file: model.pkl |
|
widget: |
|
- structuredData: |
|
domain_age: |
|
- 11039.0 |
|
- -1.0 |
|
- 5636.0 |
|
domain_registration_length: |
|
- 3571.0 |
|
- 0.0 |
|
- 208.0 |
|
google_index: |
|
- 0.0 |
|
- 0.0 |
|
- 0.0 |
|
nb_hyperlinks: |
|
- 97.0 |
|
- 168.0 |
|
- 52.0 |
|
page_rank: |
|
- 5.0 |
|
- 2.0 |
|
- 10.0 |
|
ratio_extHyperlinks: |
|
- 0.030927835 |
|
- 0.220238095 |
|
- 0.442307692 |
|
ratio_extRedirection: |
|
- 0.0 |
|
- 0.378378378 |
|
- 0.0 |
|
ratio_intHyperlinks: |
|
- 0.969072165 |
|
- 0.779761905 |
|
- 0.557692308 |
|
safe_anchor: |
|
- 25.0 |
|
- 24.32432432 |
|
- 0.0 |
|
status: |
|
- legitimate |
|
- legitimate |
|
- legitimate |
|
web_traffic: |
|
- 178542.0 |
|
- 0.0 |
|
- 2.0 |
|
inference: false |
|
pipeline_tag: tabular-classification |
|
--- |
|
|
|
# Model Description |
|
|
|
|
|
The model predicts the probability that a URL is a phishing site using a list of features. |
|
|
|
- **Model type:** Traditional machine learning |
|
- **Task:** Tabular classification (Binary) |
|
- **License:** MIT |
|
- **Repository:** https://github.com/pirocheto/phishing-url-detection |
|
|
|
|
|
## Evaluation |
|
|
|
| Metric | Value | |
|
|-----------|----------| |
|
| accuracy | 0.945652 | |
|
| f1-score | 0.945114 | |
|
| precision | 0.951996 | |
|
| recall | 0.938331 | |
|
|
|
# How to Get Started with the Model |
|
|
|
|
|
Loading a model with pickle is discouraged: deserializing a pickle file can execute arbitrary code, so a file from an untrusted source is a security risk. |
|
Pickle files are also not portable across Python versions and cannot be used from other languages. |
|
|
|
Instead, we recommend using the ONNX model, which is more secure. |
|
It is half the size and almost twice as fast compared to the pickle model. |
|
Additionally, it can be utilized by languages supported by the [ONNX runtime](https://onnxruntime.ai/docs/get-started/) (see below for an example using JavaScript). |
|
|
|
|
|
## With ONNX (recommended) |
|
|
|
### Python |
|
|
|
```python |
|
import onnxruntime
import pandas as pd
from huggingface_hub import hf_hub_download

REPO_ID = "pirocheto/phishing-url-detection"
FILENAME = "model.onnx"

# Fetch the ONNX model file from the Hub, then open a CPU-only inference session on it
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
sess = onnxruntime.InferenceSession(model_path, providers=["CPUExecutionProvider"])

# One record per URL: the URL itself followed by its numeric features
data = [
    {
        "url": "https://www.rga.com/about/workplace",
        "nb_hyperlinks": 97,
        "ratio_intHyperlinks": 0.969072165,
        "ratio_extHyperlinks": 0.030927835,
        "ratio_extRedirection": 0,
        "safe_anchor": 25,
        "domain_registration_length": 3571,
        "domain_age": 11039,
        "web_traffic": 178542,
        "google_index": 0,
        "page_rank": 5,
    },
]

# Build the model input: move the URL into the index so that only the feature
# columns remain, then convert them to a float32 NumPy array.
# Column order follows the key order of the records above.
df = pd.DataFrame(data)
df = df.set_index("url")
inputs = df.to_numpy(dtype="float32")

# Run inference on the input named "X"; the second output is used as the
# per-row probabilities
outputs = sess.run(None, {"X": inputs})
probas = outputs[1]

# Print, for each URL, the value at index 1 of its probability row as a percentage
for url, proba in zip(data, probas):
    print(f"URL: {url['url']}")
    print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
    print("----")

# Expected output:
# URL: https://www.rga.com/about/workplace
# Likelihood of being a phishing site: 0.89%
# ----
|
|
|
``` |
|
|
|
### JavaScript |
|
|
|
```javascript |
|
const ort = require('onnxruntime-node');

// Feature columns, in the order used to build the input tensor.
// This is the same order as the keys of the sample record below; selecting
// features by name keeps the tensor layout independent of object key order.
const FEATURE_NAMES = [
  "nb_hyperlinks",
  "ratio_intHyperlinks",
  "ratio_extHyperlinks",
  "ratio_extRedirection",
  "safe_anchor",
  "domain_registration_length",
  "domain_age",
  "web_traffic",
  "google_index",
  "page_rank",
];

// The flat `probabilities` output is read as NUM_CLASSES values per row;
// the value at PHISHING_CLASS_INDEX is the one reported as the phishing likelihood.
const NUM_CLASSES = 2;
const PHISHING_CLASS_INDEX = 1;

const data = [
  {
    "url": "http://rapidpaws.com/wp-content/we_transfer/index2.php?email=/",
    "nb_hyperlinks": 1,
    "ratio_intHyperlinks": 1,
    "ratio_extHyperlinks": 0,
    "ratio_extRedirection": 0,
    "safe_anchor": 0,
    "domain_registration_length": 338,
    "domain_age": 0,
    "web_traffic": 1853,
    "google_index": 1,
    "page_rank": 2,
  },
];

/**
 * Loads the ONNX model from disk, scores every record in `data`, and prints
 * the likelihood of each URL being a phishing site.
 *
 * Any failure (model loading, invalid feature value, inference) is caught and
 * reported on stderr.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    // Make sure you have downloaded the model.onnx
    // Creating an ONNX inference session with the specified model
    const modelPath = "./models/model.onnx";
    const session = await ort.InferenceSession.create(modelPath);

    // Creating an ONNX tensor of shape [rows, features] from the input data.
    // A missing or non-numeric feature would otherwise end up as NaN in the
    // tensor, so it is rejected explicitly.
    const inputs = data.map((record) =>
      FEATURE_NAMES.map((name) => {
        const value = record[name];
        if (typeof value !== 'number' || Number.isNaN(value)) {
          throw new Error(`invalid value for feature "${name}" of ${record.url}`);
        }
        return value;
      }),
    );
    const tensor = new ort.Tensor(
      'float32',
      Float32Array.from(inputs.flat()),
      [inputs.length, FEATURE_NAMES.length],
    );

    // Executing the inference session with the input tensor
    const results = await session.run({ "X": tensor });
    const probas = results['probabilities'].data;

    // Displaying results for each URL
    data.forEach((record, index) => {
      const proba = probas[index * NUM_CLASSES + PHISHING_CLASS_INDEX];
      const percent = (proba * 100).toFixed(2);

      console.log(`URL: ${record.url}`);
      console.log(`Likelihood of being a phishing site: ${percent}%`);
      console.log("----");
    });
  } catch (e) {
    console.error(`failed to inference ONNX model: ${e}.`);
  }
}

main();

// Example output (NN.NN is a placeholder; the actual percentage depends on the model file):
// URL: http://rapidpaws.com/wp-content/we_transfer/index2.php?email=/
// Likelihood of being a phishing site: NN.NN%
// ----
|
|
|
``` |
|
|