import os
import joblib
import pandas as pd
from typing import Any, Dict, List
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import gradio as gr
# Constants for directories and file names
MODEL_DIR = 'models'
DATA_DIR = 'datasets'
DATA_FILE = 'cleaned_transaction_dataset.csv'
MODEL_NAMES = [
    'LGBM Classifier',
    'CatBoost Classifier',
    'XGBoost Classifier',
]
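# Each model is expected on disk as models/<name with spaces removed>.joblib,
# e.g. models/LGBMClassifier.joblib (see load_models below).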
# Load dataset
data_path = os.path.join(DATA_DIR, DATA_FILE)
df = pd.read_csv(data_path)
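# The cleaned dataset is assumed to contain the 16 numeric transaction features
# used below plus a binary 'FLAG' column marking fraudulent addresses.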
# Load models
def load_models(model_names: List[str]) -> Dict[str, Any]:
    """Load machine learning models from disk."""
    models = {}
    for name in model_names:
        path = os.path.join(MODEL_DIR, f"{name.replace(' ', '')}.joblib")
        try:
            models[name] = joblib.load(path)
        except Exception as e:
            print(f"Error loading model {name}: {str(e)}")
    return models
models = load_models(MODEL_NAMES)
# Prepare features and target
X = df.drop(columns=['FLAG'])
y = df['FLAG']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123)
# Standardize the features
scaler = StandardScaler().fit(X_train)
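# The scaler is fitted on the training split only and reused both for the
# incoming Gradio sample and for the training data passed to SMOTE below.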
# Prediction and metrics evaluation function
def calculate_metrics(y_true, y_pred, average_type='binary'):
    """Calculate and return accuracy, recall, F1, and precision scores."""
    acc = accuracy_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred, average=average_type)
    f1 = f1_score(y_true, y_pred, average=average_type)
    prec = precision_score(y_true, y_pred, average=average_type)
    return acc, rec, f1, prec
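# Quick sanity check (hand-worked, not part of the original script):
# calculate_metrics([0, 1, 1], [0, 1, 0]) -> (0.667, 0.5, 0.667, 1.0),
# i.e. accuracy 2/3, recall 1/2, F1 2/3, precision 1/1.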
def load_and_predict(input_data):
    """Run every loaded model on the input sample and report its metrics."""
    try:
        # Scale the input sample using the already-fitted scaler
        sample_trans = scaler.transform(input_data)
        # Using SMOTE to handle class imbalance
        X_resampled, y_resampled = SMOTE(random_state=123).fit_resample(scaler.transform(X_train), y_train)
        results = []
        for name, model in models.items():
            flag_pred = model.predict(sample_trans)
            y_resampled_pred = model.predict(X_resampled)
            acc, rec, f1, prec = calculate_metrics(y_resampled, y_resampled_pred)
            results.append({
                'Model': name,
                'Predicted Fraud': 'Yes' if flag_pred[0] == 1 else 'No',
                'Accuracy %': acc * 100,
                'Recall %': rec * 100,
                'F1 %': f1 * 100,
                'Precision %': prec * 100
            })
        return pd.DataFrame(results).sort_values(by='Accuracy %', ascending=False)
    except Exception as e:
        return f"An error occurred during prediction: {str(e)}"
# Gradio interface
def predict(avg_min_sent, avg_min_received, time_diff, sent_tnx, received_tnx, num_created_contracts,
            max_value_received, avg_value_received, avg_value_sent, total_sent,
            total_balance, erc20_received, erc20_sent, erc20_sent_contract,
            erc20_unique_sent, erc20_unique_received):
    """Collect the Gradio inputs into a single-row DataFrame and run the models."""
    # The argument order is assumed to match the column order of X (the training
    # features); labelling the columns keeps scaler.transform consistent with fit.
    input_features = [
        avg_min_sent,
        avg_min_received,
        time_diff,
        sent_tnx,
        received_tnx,
        num_created_contracts,
        max_value_received,
        avg_value_received,
        avg_value_sent,
        total_sent,
        total_balance,
        erc20_received,
        erc20_sent,
        erc20_sent_contract,
        erc20_unique_sent,
        erc20_unique_received
    ]
    input_data = pd.DataFrame([input_features], columns=X.columns)
    results_df = load_and_predict(input_data)
    return results_df
# Gradio inputs, one per model feature, each defaulted to the dataset mean
inputs = [
    gr.Number(label="Avg min between sent tnx", value=df["Avg min between sent tnx"].mean()),
    gr.Number(label="Avg min between received tnx", value=df["Avg min between received tnx"].mean()),
    gr.Number(label="Time difference between first and last (mins)", value=df["Time difference between first and last (mins)"].mean()),
    gr.Number(label="Sent tnx", value=df["Sent tnx"].mean()),
    gr.Number(label="Received tnx", value=df["Received tnx"].mean()),
    gr.Number(label="Number of created contracts", value=int(df["Number of created contracts"].mean())),
    gr.Number(label="Max value received", value=df["Max value received"].mean()),
    gr.Number(label="Avg value received", value=df["Avg value received"].mean()),
    gr.Number(label="Avg value sent", value=df["Avg value sent"].mean()),
    gr.Number(label="Total either sent", value=df["Total either sent"].mean()),
    gr.Number(label="Total either balance", value=df["Total either balance"].mean()),
    gr.Number(label="ERC20 total either received", value=df["ERC20 total either received"].mean()),
    gr.Number(label="ERC20 total either sent", value=df["ERC20 total either sent"].mean()),
    gr.Number(label="ERC20 total either sent contract", value=df["ERC20 total either sent contract"].mean()),
    gr.Number(label="ERC20 unique sent address", value=df["ERC20 unique sent address"].mean()),
    gr.Number(label="ERC20 unique received token name", value=df["ERC20 unique received token name"].mean()),
]
output = gr.Dataframe(label="Prediction Results")
# Create the Gradio interface
gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=output,
    title="Ethereum Fraud Detection App",
    description="This application predicts fraud in Ethereum transactions using multiple machine learning models.",
    theme="compact"
).launch()