commit from github

Files changed (4)

README.md +24 -137
config.json +2 -1
model.onnx +2 -2
model.pkl +2 -2

README.md CHANGED

@@ -5,6 +5,7 @@ tags:
 - tabular-classification
 - sklearn
 - phishing
 model_format: pickle
 model_file: model.pkl
 widget:
@@ -53,6 +54,7 @@ widget:
     - 178542.0
     - 0.0
     - 2.0
 pipeline_tag: tabular-classification
 ---
@@ -113,172 +115,57 @@ This is the architecture of the model loaded by joblib.
 Below are some code snippets to load the model.
-## With joblib (not recommended)
-```python
-import joblib
-import pandas as pd
-urls = [
-    {
-        "url": "https://www.rga.com/about/workplace",
-        "nb_hyperlinks": 97.0,
-        "ratio_intHyperlinks": 0.969072165,
-        "ratio_extHyperlinks": 0.030927835,
-        "ratio_extRedirection": 0.0,
-        "safe_anchor": 25.0,
-        "domain_registration_length": 3571.0,
-        "domain_age": 11039,
-        "web_traffic": 178542.0,
-        "google_index": 0.0,
-        "page_rank": 5,
-    },
-]
-model = joblib.load("models/model.pkl")
-df = pd.DataFrame(urls)
-df = df.set_index("url")
-probas = model.predict_proba(df.values)
-for url, proba in zip(urls, probas):
-    print(f"URL: {url['url']}")
-    print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
-    print("----")
-# output:
-# URL: https://www.rga.com/about/workplace
-# Likelihood of being a phishing site: 0.89%
-# ----
-```
 ## With ONNX (recommended)
 ### Python
 ```python
-import numpy as np
 import onnxruntime
 import pandas as pd
 # Defining a list of URLs with characteristics
-urls = [
     {
         "url": "https://www.rga.com/about/workplace",
-        "nb_hyperlinks": 97.0,
         "ratio_intHyperlinks": 0.969072165,
         "ratio_extHyperlinks": 0.030927835,
-        "ratio_extRedirection": 0.0,
-        "safe_anchor": 25.0,
-        "domain_registration_length": 3571.0,
         "domain_age": 11039,
-        "web_traffic": 178542.0,
-        "google_index": 0.0,
         "page_rank": 5,
     },
 ]
-# Initializing the ONNX Runtime session with the pre-trained model
-sess = onnxruntime.InferenceSession(
-    "models/model.onnx",
-    providers=["CPUExecutionProvider"],
-)
-# Creating a DataFrame from the list of URLs
-df = pd.DataFrame(urls)
-df = df.set_index("url")
-# Converting DataFrame data to a float32 NumPy array
-inputs = df.astype(np.float32).to_numpy()
 # Using the ONNX model to make predictions on the input data
 probas = sess.run(None, {"X": inputs})[1]
 # Displaying the results
-for url, proba in zip(urls, probas):
-    print(proba)
     print(f"URL: {url['url']}")
     print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
     print("----")
-# output:
 # URL: https://www.rga.com/about/workplace
 # Likelihood of being a phishing site: 0.89%
 # ----
 ```
-### JavaScript
-```javascript
-const ort = require('onnxruntime-node');
-const urls = [
-    {
-        "url": "http://rapidpaws.com/wp-content/we_transfer/index2.php?email=/",
-        "nb_hyperlinks": 1,
-        "ratio_intHyperlinks": 1,
-        "ratio_extHyperlinks": 0,
-        "ratio_extRedirection": 0,
-        "safe_anchor": 0,
-        "domain_registration_length": 338,
-        "domain_age": 0,
-        "web_traffic":1853,
-        "google_index": 1,
-        "page_rank": 2,
-    },
-];
-async function main() {
-    try {
-        // Creating an ONNX inference session with the specified model
-        const model_path = "./models/model.onnx";
-        const session = await ort.InferenceSession.create(model_path);
-        // Get values from data and remove url links
-        const inputs = urls.map(url => Object.values(url).slice(1));
-        // Flattening the 2D array to get a 1D array
-        const flattenInputs = inputs.flat();
-        // Creating an ONNX tensor from the input array
-        const tensor = new ort.Tensor('float32', flattenInputs, [inputs.length, 10]);
-        // Executing the inference session with the input tensor
-        const results = await session.run({"X": tensor});
-        // Retrieving probability data from the results
-        const probas = results['probabilities'].data;
-        // Displaying results for each URL
-        urls.forEach((url, index) => {
-            // The index * 2 + 1 is used to access the probability associated with the phishing class
-            const proba = probas[index * 2 + 1];
-            const percent = (proba * 100).toFixed(2);
-            console.log(`URL: ${url.url}`);
-            console.log(`Likelihood of being a phishing site: ${percent}%`);
-            console.log("----");
-        });
-    } catch (e) {
-        console.log(`failed to inference ONNX model: ${e}.`);
-    }
-};
-main();
-// output:
-// URL: https://www.rga.com/about/workplace
-// Likelihood of being a phishing site: 0.89%
-// ----
-```

 - tabular-classification
 - sklearn
 - phishing
+- onnx
 model_format: pickle
 model_file: model.pkl
 widget:
     - 178542.0
     - 0.0
     - 2.0
+inference: false
 pipeline_tag: tabular-classification
 ---
 Below are some code snippets to load the model.
 ## With ONNX (recommended)
 ### Python
 ```python
 import onnxruntime
 import pandas as pd
+from huggingface_hub import hf_hub_download
+REPO_ID = "pirocheto/phishing-url-detection"
+FILENAME = "model.onnx"
+# Initializing the ONNX Runtime session with the pre-trained model
+sess = onnxruntime.InferenceSession(
+    hf_hub_download(repo_id=REPO_ID, filename=FILENAME),
+    providers=["CPUExecutionProvider"],
+)
 # Defining a list of URLs with characteristics
+data = [
     {
         "url": "https://www.rga.com/about/workplace",
+        "nb_hyperlinks": 97,
         "ratio_intHyperlinks": 0.969072165,
         "ratio_extHyperlinks": 0.030927835,
+        "ratio_extRedirection": 0,
+        "safe_anchor": 25,
+        "domain_registration_length": 3571,
         "domain_age": 11039,
+        "web_traffic": 178542,
+        "google_index": 0,
         "page_rank": 5,
     },
 ]
+# Converting data to a float32 NumPy array
+df = pd.DataFrame(data).set_index("url")
+inputs = df.to_numpy(dtype="float32")
 # Using the ONNX model to make predictions on the input data
 probas = sess.run(None, {"X": inputs})[1]
 # Displaying the results
+for url, proba in zip(data, probas):
     print(f"URL: {url['url']}")
     print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
     print("----")
+# Output:
 # URL: https://www.rga.com/about/workplace
 # Likelihood of being a phishing site: 0.89%
 # ----
 ```

config.json CHANGED

@@ -14,7 +14,8 @@
             "status"
         ],
         "environment": [
-            "scikit-learn=1.3.2"
         ],
         "example_input": {
             "domain_age": [

             "status"
         ],
         "environment": [
+            "scikit-learn=1.3.2",
+            "joblib=1.3.2"
         ],
         "example_input": {
             "domain_age": [

model.onnx CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:752dcb545a8bd4c2f5274351381f40e30e5152d68c51cfd3b5a80627ec3ec660
-size 22151408

 version https://git-lfs.github.com/spec/v1
+oid sha256:82ca6061ebf87588b3b77d17aabd252ca45759699f2e7fc1483d62edfc30a512
+size 22231032

model.pkl CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6b4b1ddc24febf9cfbb0ff4a289d5d5620b3922f68a5e6fe978a860de6e51c81
-size 45906459

 version https://git-lfs.github.com/spec/v1
+oid sha256:1e83b36152de04bb2f70a888ed8e09ad094d80375326033b60590fd5f3f78f6f
+size 46069813