GitHub Action committed on
Commit
672fe2d
·
1 Parent(s): 0652382

commit from github

Browse files
Files changed (4) hide show
  1. README.md +24 -137
  2. config.json +2 -1
  3. model.onnx +2 -2
  4. model.pkl +2 -2
README.md CHANGED
@@ -5,6 +5,7 @@ tags:
5
  - tabular-classification
6
  - sklearn
7
  - phishing
 
8
  model_format: pickle
9
  model_file: model.pkl
10
  widget:
@@ -53,6 +54,7 @@ widget:
53
  - 178542.0
54
  - 0.0
55
  - 2.0
 
56
  pipeline_tag: tabular-classification
57
  ---
58
 
@@ -113,172 +115,57 @@ This is the architecture of the model loaded by joblib.
113
 
114
  Below are some code snippets to load the model.
115
 
116
- ## With joblib (not recommended)
117
-
118
- ```python
119
- import joblib
120
- import pandas as pd
121
-
122
- urls = [
123
- {
124
- "url": "https://www.rga.com/about/workplace",
125
- "nb_hyperlinks": 97.0,
126
- "ratio_intHyperlinks": 0.969072165,
127
- "ratio_extHyperlinks": 0.030927835,
128
- "ratio_extRedirection": 0.0,
129
- "safe_anchor": 25.0,
130
- "domain_registration_length": 3571.0,
131
- "domain_age": 11039,
132
- "web_traffic": 178542.0,
133
- "google_index": 0.0,
134
- "page_rank": 5,
135
- },
136
- ]
137
-
138
-
139
- model = joblib.load("models/model.pkl")
140
-
141
- df = pd.DataFrame(urls)
142
- df = df.set_index("url")
143
-
144
- probas = model.predict_proba(df.values)
145
-
146
- for url, proba in zip(urls, probas):
147
- print(f"URL: {url['url']}")
148
- print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
149
- print("----")
150
-
151
-
152
- # output:
153
- # URL: https://www.rga.com/about/workplace
154
- # Likelihood of being a phishing site: 0.89%
155
- # ----
156
-
157
- ```
158
-
159
  ## With ONNX (recommended)
160
 
161
  ### Python
162
 
163
  ```python
164
- import numpy as np
165
  import onnxruntime
166
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
167
 
168
  # Defining a list of URLs with characteristics
169
- urls = [
170
  {
171
  "url": "https://www.rga.com/about/workplace",
172
- "nb_hyperlinks": 97.0,
173
  "ratio_intHyperlinks": 0.969072165,
174
  "ratio_extHyperlinks": 0.030927835,
175
- "ratio_extRedirection": 0.0,
176
- "safe_anchor": 25.0,
177
- "domain_registration_length": 3571.0,
178
  "domain_age": 11039,
179
- "web_traffic": 178542.0,
180
- "google_index": 0.0,
181
  "page_rank": 5,
182
  },
183
  ]
184
 
185
- # Initializing the ONNX Runtime session with the pre-trained model
186
- sess = onnxruntime.InferenceSession(
187
- "models/model.onnx",
188
- providers=["CPUExecutionProvider"],
189
- )
190
-
191
- # Creating a DataFrame from the list of URLs
192
- df = pd.DataFrame(urls)
193
- df = df.set_index("url")
194
-
195
- # Converting DataFrame data to a float32 NumPy array
196
- inputs = df.astype(np.float32).to_numpy()
197
-
198
 
199
  # Using the ONNX model to make predictions on the input data
200
  probas = sess.run(None, {"X": inputs})[1]
201
 
202
-
203
  # Displaying the results
204
- for url, proba in zip(urls, probas):
205
- print(proba)
206
  print(f"URL: {url['url']}")
207
  print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
208
  print("----")
209
 
210
- # output:
211
  # URL: https://www.rga.com/about/workplace
212
  # Likelihood of being a phishing site: 0.89%
213
  # ----
214
 
215
  ```
216
-
217
- ### JavaScript
218
-
219
- ```javascript
220
- const ort = require('onnxruntime-node');
221
-
222
- const urls = [
223
- {
224
- "url": "http://rapidpaws.com/wp-content/we_transfer/index2.php?email=/",
225
- "nb_hyperlinks": 1,
226
- "ratio_intHyperlinks": 1,
227
- "ratio_extHyperlinks": 0,
228
- "ratio_extRedirection": 0,
229
- "safe_anchor": 0,
230
- "domain_registration_length": 338,
231
- "domain_age": 0,
232
- "web_traffic":1853,
233
- "google_index": 1,
234
- "page_rank": 2,
235
- },
236
- ];
237
-
238
- async function main() {
239
- try {
240
-
241
- // Creating an ONNX inference session with the specified model
242
- const model_path = "./models/model.onnx";
243
- const session = await ort.InferenceSession.create(model_path);
244
-
245
- // Get values from data and remove url links
246
- const inputs = urls.map(url => Object.values(url).slice(1));
247
-
248
- // Flattening the 2D array to get a 1D array
249
- const flattenInputs = inputs.flat();
250
-
251
- // Creating an ONNX tensor from the input array
252
- const tensor = new ort.Tensor('float32', flattenInputs, [inputs.length, 10]);
253
-
254
- // Executing the inference session with the input tensor
255
- const results = await session.run({"X": tensor});
256
-
257
- // Retrieving probability data from the results
258
- const probas = results['probabilities'].data;
259
-
260
- // Displaying results for each URL
261
- urls.forEach((url, index) => {
262
- // The index * 2 + 1 is used to access the probability associated with the phishing class
263
- const proba = probas[index * 2 + 1];
264
- const percent = (proba * 100).toFixed(2);
265
-
266
- console.log(`URL: ${url.url}`);
267
- console.log(`Likelihood of being a phishing site: ${percent}%`);
268
- console.log("----");
269
- });
270
-
271
-
272
- } catch (e) {
273
- console.log(`failed to inference ONNX model: ${e}.`);
274
- }
275
- };
276
-
277
- main();
278
-
279
- // output:
280
- // URL: https://www.rga.com/about/workplace
281
- // Likelihood of being a phishing site: 0.89%
282
- // ----
283
-
284
- ```
 
5
  - tabular-classification
6
  - sklearn
7
  - phishing
8
+ - onnx
9
  model_format: pickle
10
  model_file: model.pkl
11
  widget:
 
54
  - 178542.0
55
  - 0.0
56
  - 2.0
57
+ inference: false
58
  pipeline_tag: tabular-classification
59
  ---
60
 
 
115
 
116
  Below are some code snippets to load the model.
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  ## With ONNX (recommended)
119
 
120
  ### Python
121
 
122
  ```python
 
123
  import onnxruntime
124
  import pandas as pd
125
+ from huggingface_hub import hf_hub_download
126
+
127
+ REPO_ID = "pirocheto/phishing-url-detection"
128
+ FILENAME = "model.onnx"
129
+
130
+ # Initializing the ONNX Runtime session with the pre-trained model
131
+ sess = onnxruntime.InferenceSession(
132
+ hf_hub_download(repo_id=REPO_ID, filename=FILENAME),
133
+ providers=["CPUExecutionProvider"],
134
+ )
135
 
136
  # Defining a list of URLs with characteristics
137
+ data = [
138
  {
139
  "url": "https://www.rga.com/about/workplace",
140
+ "nb_hyperlinks": 97,
141
  "ratio_intHyperlinks": 0.969072165,
142
  "ratio_extHyperlinks": 0.030927835,
143
+ "ratio_extRedirection": 0,
144
+ "safe_anchor": 25,
145
+ "domain_registration_length": 3571,
146
  "domain_age": 11039,
147
+ "web_traffic": 178542,
148
+ "google_index": 0,
149
  "page_rank": 5,
150
  },
151
  ]
152
 
153
+ # Converting data to a float32 NumPy array
154
+ df = pd.DataFrame(data).set_index("url")
155
+ inputs = df.to_numpy(dtype="float32")
 
 
 
 
 
 
 
 
 
 
156
 
157
  # Using the ONNX model to make predictions on the input data
158
  probas = sess.run(None, {"X": inputs})[1]
159
 
 
160
  # Displaying the results
161
+ for url, proba in zip(data, probas):
 
162
  print(f"URL: {url['url']}")
163
  print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
164
  print("----")
165
 
166
+ # Output:
167
  # URL: https://www.rga.com/about/workplace
168
  # Likelihood of being a phishing site: 0.89%
169
  # ----
170
 
171
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.json CHANGED
@@ -14,7 +14,8 @@
14
  "status"
15
  ],
16
  "environment": [
17
- "scikit-learn=1.3.2"
 
18
  ],
19
  "example_input": {
20
  "domain_age": [
 
14
  "status"
15
  ],
16
  "environment": [
17
+ "scikit-learn=1.3.2",
18
+ "joblib=1.3.2"
19
  ],
20
  "example_input": {
21
  "domain_age": [
model.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:752dcb545a8bd4c2f5274351381f40e30e5152d68c51cfd3b5a80627ec3ec660
3
- size 22151408
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82ca6061ebf87588b3b77d17aabd252ca45759699f2e7fc1483d62edfc30a512
3
+ size 22231032
model.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b4b1ddc24febf9cfbb0ff4a289d5d5620b3922f68a5e6fe978a860de6e51c81
3
- size 45906459
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e83b36152de04bb2f70a888ed8e09ad094d80375326033b60590fd5f3f78f6f
3
+ size 46069813