---
license: mit
library_name: sklearn
tags:
- classification
- phishing
---
Model description
Training Procedure
Hyperparameters
Hyperparameter | Value |
---|---|
memory | |
steps | [('standardscaler', StandardScaler()), ('calibratedclassifiercv', CalibratedClassifierCV(cv=5, estimator=RandomForestClassifier(), method='isotonic'))] |
verbose | False |
standardscaler | StandardScaler() |
calibratedclassifiercv | CalibratedClassifierCV(cv=5, estimator=RandomForestClassifier(), method='isotonic') |
standardscaler__copy | True |
standardscaler__with_mean | True |
standardscaler__with_std | True |
calibratedclassifiercv__base_estimator | deprecated |
calibratedclassifiercv__cv | 5 |
calibratedclassifiercv__ensemble | True |
calibratedclassifiercv__estimator__bootstrap | True |
calibratedclassifiercv__estimator__ccp_alpha | 0.0 |
calibratedclassifiercv__estimator__class_weight | |
calibratedclassifiercv__estimator__criterion | gini |
calibratedclassifiercv__estimator__max_depth | |
calibratedclassifiercv__estimator__max_features | sqrt |
calibratedclassifiercv__estimator__max_leaf_nodes | |
calibratedclassifiercv__estimator__max_samples | |
calibratedclassifiercv__estimator__min_impurity_decrease | 0.0 |
calibratedclassifiercv__estimator__min_samples_leaf | 1 |
calibratedclassifiercv__estimator__min_samples_split | 2 |
calibratedclassifiercv__estimator__min_weight_fraction_leaf | 0.0 |
calibratedclassifiercv__estimator__n_estimators | 100 |
calibratedclassifiercv__estimator__n_jobs | |
calibratedclassifiercv__estimator__oob_score | False |
calibratedclassifiercv__estimator__random_state | |
calibratedclassifiercv__estimator__verbose | 0 |
calibratedclassifiercv__estimator__warm_start | False |
calibratedclassifiercv__estimator | RandomForestClassifier() |
calibratedclassifiercv__method | isotonic |
calibratedclassifiercv__n_jobs | |
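The flattened names in this table follow scikit-learn's `get_params(deep=True)` convention (`step__parameter`). A loaded copy of the pipeline can reproduce the same listing; this is a small sketch, assuming the joblib artifact path used in the usage section below:

```python
import joblib

# Assumed path from the "How to Get Started" section below.
model = joblib.load("models/model.pkl")

# get_params(deep=True) yields the flattened "step__parameter" names shown above.
for name, value in model.get_params(deep=True).items():
    print(f"{name} | {value}")
```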
Model Plot
This is the architecture of the model loaded by joblib.
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('calibratedclassifiercv',
                 CalibratedClassifierCV(cv=5,
                                        estimator=RandomForestClassifier(),
                                        method='isotonic'))])
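For reference, a pipeline with the architecture and hyperparameters listed above could be built as follows. This is a minimal sketch, not the original training script; the training data (`X_train`, `y_train`) is assumed and is not part of this card.

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier

# Standardize the 10 numeric features, then calibrate a random forest
# with 5-fold isotonic calibration (all forest hyperparameters at defaults).
model = make_pipeline(
    StandardScaler(),
    CalibratedClassifierCV(
        estimator=RandomForestClassifier(),
        cv=5,
        method="isotonic",
    ),
)

# X_train / y_train are assumed training features and labels (not provided here).
# model.fit(X_train, y_train)
```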
Evaluation Results
Metric | Value |
---|---|
accuracy | 0.945652 |
f1-score | 0.945114 |
precision | 0.951996 |
recall | 0.938331 |
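The metrics above can be reproduced on a held-out split with scikit-learn's standard scorers. A minimal sketch, assuming `model` is the loaded pipeline, `X_test`/`y_test` are the test features and labels (not distributed with this card), and the phishing class is the positive label:

```python
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# X_test / y_test are assumed to be the held-out test set (not included here).
y_pred = model.predict(X_test)

print(f"accuracy:  {accuracy_score(y_test, y_pred):.6f}")
print(f"f1-score:  {f1_score(y_test, y_pred):.6f}")
print(f"precision: {precision_score(y_test, y_pred):.6f}")
print(f"recall:    {recall_score(y_test, y_pred):.6f}")
```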
Test Report
Model Interpretation
Feature Importances
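The feature-importance figure is not reproduced here. One way to obtain a comparable ranking for the full calibrated pipeline is permutation importance; this sketch assumes a test set is available (it is not shipped with the card) and uses the feature order from the usage examples below. It is not necessarily how the original figure was generated.

```python
from sklearn.inspection import permutation_importance

# X_test / y_test are assumed; feature order matches the usage examples below.
feature_names = [
    "nb_hyperlinks", "ratio_intHyperlinks", "ratio_extHyperlinks",
    "ratio_extRedirection", "safe_anchor", "domain_registration_length",
    "domain_age", "web_traffic", "google_index", "page_rank",
]

result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=0)

# Print features from most to least important (mean drop in score when shuffled).
for name, score in sorted(zip(feature_names, result.importances_mean),
                          key=lambda t: t[1], reverse=True):
    print(f"{name}: {score:.4f}")
```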
How to Get Started with the Model
Below are code snippets showing how to load the model and run predictions.
With joblib (not recommended)
import joblib
import pandas as pd

# A list of URLs with the 10 features expected by the model
urls = [
    {
        "url": "https://www.rga.com/about/workplace",
        "nb_hyperlinks": 97.0,
        "ratio_intHyperlinks": 0.969072165,
        "ratio_extHyperlinks": 0.030927835,
        "ratio_extRedirection": 0.0,
        "safe_anchor": 25.0,
        "domain_registration_length": 3571.0,
        "domain_age": 11039,
        "web_traffic": 178542.0,
        "google_index": 0.0,
        "page_rank": 5,
    },
]

# Load the fitted pipeline and build the feature matrix
model = joblib.load("models/model.pkl")
df = pd.DataFrame(urls)
df = df.set_index("url")

# Predict calibrated probabilities; index 1 is the phishing class
probas = model.predict_proba(df.values)

for url, proba in zip(urls, probas):
    print(f"URL: {url['url']}")
    print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
    print("----")

# output:
# URL: https://www.rga.com/about/workplace
# Likelihood of being a phishing site: 0.89%
# ----
With ONNX (recommended)
Python
import numpy as np
import onnxruntime
import pandas as pd

# Defining a list of URLs with characteristics
urls = [
    {
        "url": "https://www.rga.com/about/workplace",
        "nb_hyperlinks": 97.0,
        "ratio_intHyperlinks": 0.969072165,
        "ratio_extHyperlinks": 0.030927835,
        "ratio_extRedirection": 0.0,
        "safe_anchor": 25.0,
        "domain_registration_length": 3571.0,
        "domain_age": 11039,
        "web_traffic": 178542.0,
        "google_index": 0.0,
        "page_rank": 5,
    },
]

# Initializing the ONNX Runtime session with the pre-trained model
sess = onnxruntime.InferenceSession(
    "models/model.onnx",
    providers=["CPUExecutionProvider"],
)

# Creating a DataFrame from the list of URLs
df = pd.DataFrame(urls)
df = df.set_index("url")

# Converting DataFrame data to a float32 NumPy array
inputs = df.astype(np.float32).to_numpy()

# Using the ONNX model to make predictions on the input data;
# the second output holds the class probabilities
probas = sess.run(None, {"X": inputs})[1]

# Displaying the results
for url, proba in zip(urls, probas):
    print(f"URL: {url['url']}")
    print(f"Likelihood of being a phishing site: {proba[1] * 100:.2f}%")
    print("----")

# output:
# URL: https://www.rga.com/about/workplace
# Likelihood of being a phishing site: 0.89%
# ----
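If the graph's input or output names are ever in doubt, ONNX Runtime can list them directly. A small sketch reusing the `sess` object created above; the exact names and shapes depend on how the model was converted to ONNX:

```python
# Optional sanity check: confirm the feed name ("X") and which output
# carries the class probabilities.
for inp in sess.get_inputs():
    print("input:", inp.name, inp.shape, inp.type)
for out in sess.get_outputs():
    print("output:", out.name, out.shape, out.type)
```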
JavaScript
const ort = require('onnxruntime-node');

const urls = [
    {
        "url": "http://rapidpaws.com/wp-content/we_transfer/index2.php?email=/",
        "nb_hyperlinks": 1,
        "ratio_intHyperlinks": 1,
        "ratio_extHyperlinks": 0,
        "ratio_extRedirection": 0,
        "safe_anchor": 0,
        "domain_registration_length": 338,
        "domain_age": 0,
        "web_traffic": 1853,
        "google_index": 1,
        "page_rank": 2,
    },
];

async function main() {
    try {
        // Creating an ONNX inference session with the specified model
        const model_path = "./models/model.onnx";
        const session = await ort.InferenceSession.create(model_path);

        // Get values from data and remove url links
        const inputs = urls.map(url => Object.values(url).slice(1));

        // Flattening the 2D array to get a 1D array
        const flattenInputs = inputs.flat();

        // Creating an ONNX tensor from the input array
        const tensor = new ort.Tensor('float32', flattenInputs, [inputs.length, 10]);

        // Executing the inference session with the input tensor
        const results = await session.run({"X": tensor});

        // Retrieving probability data from the results
        const probas = results['probabilities'].data;

        // Displaying results for each URL
        urls.forEach((url, index) => {
            // index * 2 + 1 accesses the probability of the phishing class
            const proba = probas[index * 2 + 1];
            const percent = (proba * 100).toFixed(2);
            console.log(`URL: ${url.url}`);
            console.log(`Likelihood of being a phishing site: ${percent}%`);
            console.log("----");
        });
    } catch (e) {
        console.log(`failed to inference ONNX model: ${e}.`);
    }
}

main();
// output:
// URL: http://rapidpaws.com/wp-content/we_transfer/index2.php?email=/
// Likelihood of being a phishing site: <model's phishing probability for this URL>
// ----