GitHub Action
committed on
Commit
·
672fe2d
1
Parent(s):
0652382
commit from github
Browse files
- README.md +24 -137
- config.json +2 -1
- model.onnx +2 -2
- model.pkl +2 -2
README.md
CHANGED
@@ -5,6 +5,7 @@ tags:
|
|
5 |
- tabular-classification
|
6 |
- sklearn
|
7 |
- phishing
|
|
|
8 |
model_format: pickle
|
9 |
model_file: model.pkl
|
10 |
widget:
|
@@ -53,6 +54,7 @@ widget:
|
|
53 |
- 178542.0
|
54 |
- 0.0
|
55 |
- 2.0
|
|
|
56 |
pipeline_tag: tabular-classification
|
57 |
---
|
58 |
|
@@ -113,172 +115,57 @@ This is the architecture of the model loaded by joblib.
|
|
113 |
|
114 |
Below are some code snippets to load the model.
|
115 |
|
116 |
-
## With joblib (not recommended)
|
117 |
-
|
118 |
-
```python
|
119 |
-
import joblib
|
120 |
-
import pandas as pd
|
121 |
-
|
122 |
-
urls = [
|
123 |
-
{
|
124 |
-
"url": "https://www.rga.com/about/workplace",
|
125 |
-
"nb_hyperlinks": 97.0,
|
126 |
-
"ratio_intHyperlinks": 0.969072165,
|
127 |
-
"ratio_extHyperlinks": 0.030927835,
|
128 |
-
"ratio_extRedirection": 0.0,
|
129 |
-
"safe_anchor": 25.0,
|
130 |
-
"domain_registration_length": 3571.0,
|
131 |
-
"domain_age": 11039,
|
132 |
-
"web_traffic": 178542.0,
|
133 |
-
"google_index": 0.0,
|
134 |
-
"page_rank": 5,
|
135 |
-
},
|
136 |
-
]
|
137 |
-
|
138 |
-
|
139 |
-
model = joblib.load("models/model.pkl")
|
140 |
-
|
141 |
-
df = pd.DataFrame(urls)
|
142 |
-
df = df.set_index("url")
|
143 |
-
|
144 |
-
probas = model.predict_proba(df.values)
|
145 |
-
|
146 |
-
for url, proba in zip(urls, probas):
|
147 |
-
print(f"URL: {url['url']}")
|
148 |
-
print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
|
149 |
-
print("----")
|
150 |
-
|
151 |
-
|
152 |
-
# output:
|
153 |
-
# URL: https://www.rga.com/about/workplace
|
154 |
-
# Likelihood of being a phishing site: 0.89%
|
155 |
-
# ----
|
156 |
-
|
157 |
-
```
|
158 |
-
|
159 |
## With ONNX (recommended)
|
160 |
|
161 |
### Python
|
162 |
|
163 |
```python
|
164 |
-
import numpy as np
|
165 |
import onnxruntime
|
166 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
|
168 |
# Defining a list of URLs with characteristics
|
169 |
-
|
170 |
{
|
171 |
"url": "https://www.rga.com/about/workplace",
|
172 |
-
"nb_hyperlinks": 97
|
173 |
"ratio_intHyperlinks": 0.969072165,
|
174 |
"ratio_extHyperlinks": 0.030927835,
|
175 |
-
"ratio_extRedirection": 0
|
176 |
-
"safe_anchor": 25
|
177 |
-
"domain_registration_length": 3571
|
178 |
"domain_age": 11039,
|
179 |
-
"web_traffic": 178542
|
180 |
-
"google_index": 0
|
181 |
"page_rank": 5,
|
182 |
},
|
183 |
]
|
184 |
|
185 |
-
#
|
186 |
-
|
187 |
-
|
188 |
-
providers=["CPUExecutionProvider"],
|
189 |
-
)
|
190 |
-
|
191 |
-
# Creating a DataFrame from the list of URLs
|
192 |
-
df = pd.DataFrame(urls)
|
193 |
-
df = df.set_index("url")
|
194 |
-
|
195 |
-
# Converting DataFrame data to a float32 NumPy array
|
196 |
-
inputs = df.astype(np.float32).to_numpy()
|
197 |
-
|
198 |
|
199 |
# Using the ONNX model to make predictions on the input data
|
200 |
probas = sess.run(None, {"X": inputs})[1]
|
201 |
|
202 |
-
|
203 |
# Displaying the results
|
204 |
-
for url, proba in zip(
|
205 |
-
print(proba)
|
206 |
print(f"URL: {url['url']}")
|
207 |
print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
|
208 |
print("----")
|
209 |
|
210 |
-
#
|
211 |
# URL: https://www.rga.com/about/workplace
|
212 |
# Likelihood of being a phishing site: 0.89%
|
213 |
# ----
|
214 |
|
215 |
```
|
216 |
-
|
217 |
-
### JavaScript
|
218 |
-
|
219 |
-
```javascript
|
220 |
-
const ort = require('onnxruntime-node');
|
221 |
-
|
222 |
-
const urls = [
|
223 |
-
{
|
224 |
-
"url": "http://rapidpaws.com/wp-content/we_transfer/index2.php?email=/",
|
225 |
-
"nb_hyperlinks": 1,
|
226 |
-
"ratio_intHyperlinks": 1,
|
227 |
-
"ratio_extHyperlinks": 0,
|
228 |
-
"ratio_extRedirection": 0,
|
229 |
-
"safe_anchor": 0,
|
230 |
-
"domain_registration_length": 338,
|
231 |
-
"domain_age": 0,
|
232 |
-
"web_traffic":1853,
|
233 |
-
"google_index": 1,
|
234 |
-
"page_rank": 2,
|
235 |
-
},
|
236 |
-
];
|
237 |
-
|
238 |
-
async function main() {
|
239 |
-
try {
|
240 |
-
|
241 |
-
// Creating an ONNX inference session with the specified model
|
242 |
-
const model_path = "./models/model.onnx";
|
243 |
-
const session = await ort.InferenceSession.create(model_path);
|
244 |
-
|
245 |
-
// Get values from data and remove url links
|
246 |
-
const inputs = urls.map(url => Object.values(url).slice(1));
|
247 |
-
|
248 |
-
// Flattening the 2D array to get a 1D array
|
249 |
-
const flattenInputs = inputs.flat();
|
250 |
-
|
251 |
-
// Creating an ONNX tensor from the input array
|
252 |
-
const tensor = new ort.Tensor('float32', flattenInputs, [inputs.length, 10]);
|
253 |
-
|
254 |
-
// Executing the inference session with the input tensor
|
255 |
-
const results = await session.run({"X": tensor});
|
256 |
-
|
257 |
-
// Retrieving probability data from the results
|
258 |
-
const probas = results['probabilities'].data;
|
259 |
-
|
260 |
-
// Displaying results for each URL
|
261 |
-
urls.forEach((url, index) => {
|
262 |
-
// The index * 2 + 1 is used to access the probability associated with the phishing class
|
263 |
-
const proba = probas[index * 2 + 1];
|
264 |
-
const percent = (proba * 100).toFixed(2);
|
265 |
-
|
266 |
-
console.log(`URL: ${url.url}`);
|
267 |
-
console.log(`Likelihood of being a phishing site: ${percent}%`);
|
268 |
-
console.log("----");
|
269 |
-
});
|
270 |
-
|
271 |
-
|
272 |
-
} catch (e) {
|
273 |
-
console.log(`failed to inference ONNX model: ${e}.`);
|
274 |
-
}
|
275 |
-
};
|
276 |
-
|
277 |
-
main();
|
278 |
-
|
279 |
-
// output:
|
280 |
-
// URL: https://www.rga.com/about/workplace
|
281 |
-
// Likelihood of being a phishing site: 0.89%
|
282 |
-
// ----
|
283 |
-
|
284 |
-
```
|
|
|
5 |
- tabular-classification
|
6 |
- sklearn
|
7 |
- phishing
|
8 |
+
- onnx
|
9 |
model_format: pickle
|
10 |
model_file: model.pkl
|
11 |
widget:
|
|
|
54 |
- 178542.0
|
55 |
- 0.0
|
56 |
- 2.0
|
57 |
+
inference: false
|
58 |
pipeline_tag: tabular-classification
|
59 |
---
|
60 |
|
|
|
115 |
|
116 |
Below are some code snippets to load the model.
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
## With ONNX (recommended)
|
119 |
|
120 |
### Python
|
121 |
|
122 |
```python
|
|
|
123 |
import onnxruntime
|
124 |
import pandas as pd
|
125 |
+
from huggingface_hub import hf_hub_download
|
126 |
+
|
127 |
+
REPO_ID = "pirocheto/phishing-url-detection"
|
128 |
+
FILENAME = "model.onnx"
|
129 |
+
|
130 |
+
# Initializing the ONNX Runtime session with the pre-trained model
|
131 |
+
sess = onnxruntime.InferenceSession(
|
132 |
+
hf_hub_download(repo_id=REPO_ID, filename=FILENAME),
|
133 |
+
providers=["CPUExecutionProvider"],
|
134 |
+
)
|
135 |
|
136 |
# Defining a list of URLs with characteristics
|
137 |
+
data = [
|
138 |
{
|
139 |
"url": "https://www.rga.com/about/workplace",
|
140 |
+
"nb_hyperlinks": 97,
|
141 |
"ratio_intHyperlinks": 0.969072165,
|
142 |
"ratio_extHyperlinks": 0.030927835,
|
143 |
+
"ratio_extRedirection": 0,
|
144 |
+
"safe_anchor": 25,
|
145 |
+
"domain_registration_length": 3571,
|
146 |
"domain_age": 11039,
|
147 |
+
"web_traffic": 178542,
|
148 |
+
"google_index": 0,
|
149 |
"page_rank": 5,
|
150 |
},
|
151 |
]
|
152 |
|
153 |
+
# Converting data to a float32 NumPy array
|
154 |
+
df = pd.DataFrame(data).set_index("url")
|
155 |
+
inputs = df.to_numpy(dtype="float32")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
# Using the ONNX model to make predictions on the input data
|
158 |
probas = sess.run(None, {"X": inputs})[1]
|
159 |
|
|
|
160 |
# Displaying the results
|
161 |
+
for url, proba in zip(data, probas):
|
|
|
162 |
print(f"URL: {url['url']}")
|
163 |
print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
|
164 |
print("----")
|
165 |
|
166 |
+
# Output:
|
167 |
# URL: https://www.rga.com/about/workplace
|
168 |
# Likelihood of being a phishing site: 0.89%
|
169 |
# ----
|
170 |
|
171 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
config.json
CHANGED
@@ -14,7 +14,8 @@
|
|
14 |
"status"
|
15 |
],
|
16 |
"environment": [
|
17 |
-
"scikit-learn=1.3.2"
|
|
|
18 |
],
|
19 |
"example_input": {
|
20 |
"domain_age": [
|
|
|
14 |
"status"
|
15 |
],
|
16 |
"environment": [
|
17 |
+
"scikit-learn=1.3.2",
|
18 |
+
"joblib=1.3.2"
|
19 |
],
|
20 |
"example_input": {
|
21 |
"domain_age": [
|
model.onnx
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:82ca6061ebf87588b3b77d17aabd252ca45759699f2e7fc1483d62edfc30a512
|
3 |
+
size 22231032
|
model.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1e83b36152de04bb2f70a888ed8e09ad094d80375326033b60590fd5f3f78f6f
|
3 |
+
size 46069813
|