"""Credit-card fraud detection: exploration, model comparison, and a demo UI.

The script loads ``creditcard.csv``, plots a few exploratory figures, trains
three tree-ensemble classifiers (XGBoost, Random Forest, Gradient Boosting),
reports their test-set metrics, and finally serves a Gradio form that returns
each model's prediction for a single transaction.
"""

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np  # kept from the original file; currently unused
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import (  # roc_auc_score kept from the original; unused
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# ---------------------------------------------------------------------------
# Load and clean the data
# ---------------------------------------------------------------------------
data = pd.read_csv("creditcard.csv")
# Bare expressions only render in a notebook cell's last line; print so the
# output is visible when this runs as a plain script too.
print(data)
data.info()

# Count missing values per column, then drop incomplete rows.
nan_values = data.isna().sum()
print(nan_values)
data.dropna(inplace=True)

print(data["Class"].value_counts())

# ---------------------------------------------------------------------------
# Exploratory plots
# ---------------------------------------------------------------------------
# Class distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='Class', data=data, palette='viridis')
plt.xlabel('Class (0: Non-Fraud, 1: Fraud)')
plt.ylabel('Count')
plt.title('Class Distribution (Fraud vs. Non-Fraud)')
plt.show()

# Transaction amount distribution for fraudulent and non-fraudulent transactions
plt.figure(figsize=(8, 6))
sns.boxplot(x='Class', y='Amount', data=data)
plt.xlabel('Class (0: Non-Fraud, 1: Fraud)')
plt.ylabel('Transaction Amount')
plt.title('Transaction Amount Distribution by Class')
plt.show()

# Correlation matrix heatmap
correlation_matrix = data.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix Heatmap')
plt.show()

# Histograms of features for fraudulent and non-fraudulent transactions
features_to_plot = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10']
# Filter the rows once instead of re-filtering inside the loop.
non_fraud_rows = data[data['Class'] == 0]
fraud_rows = data[data['Class'] == 1]
for feature in features_to_plot:
    plt.figure(figsize=(8, 6))
    sns.histplot(non_fraud_rows[feature], label='Non-Fraud', kde=True)
    sns.histplot(fraud_rows[feature], label='Fraud', kde=True)
    plt.title(f'Distribution of {feature} by Class')
    plt.legend()
    plt.show()

# Combined visualization for Amount and Time
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
sns.histplot(data['Amount'], bins=50, kde=True)
plt.xlabel('Transaction Amount')
plt.ylabel('Frequency')
plt.title('Distribution of Transaction Amount')

plt.subplot(1, 2, 2)
sns.histplot(data['Time'], bins=50, kde=True)
plt.xlabel('Time (seconds from the first transaction)')
plt.ylabel('Frequency')
plt.title('Distribution of Transaction Time')

plt.tight_layout()
plt.show()

# Transaction amount vs. fraud
# NOTE: this repeats the Amount-by-Class box plot above under another title.
sns.boxplot(x='Class', y='Amount', data=data)
plt.title('Transaction Amount vs. Class')
plt.show()

# ---------------------------------------------------------------------------
# Train/test split
# ---------------------------------------------------------------------------
# Separate features (X) and target variable (y)
X = data.drop('Class', axis=1)
y = data['Class']

# NOTE: class imbalance is not corrected here. If oversampling (e.g. SMOTE
# from imbalanced-learn) is introduced, fit it on the training split only;
# resampling before the split leaks synthetic copies into the test set.

# Stratify so both splits keep the same fraud ratio; with so few positive
# rows an unstratified split makes the test-set fraud count vary by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# ---------------------------------------------------------------------------
# Evaluation helper
# ---------------------------------------------------------------------------
def analyze_results(y_test, y_pred, class_names=("Non-Fraud", "Fraud")):
    """Print classification metrics and plot the confusion matrix.

    Precision, recall and F1 are support-weighted averages over the classes,
    so on a heavily imbalanced target they stay close to accuracy; the
    per-class rows of the printed classification report show how the
    minority class actually fares.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.
        class_names: Tick labels for the confusion matrix, in sorted label
            order. Ignored (automatic ticks are used) when the number of
            names does not match the number of classes found.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    accuracy = accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average='weighted')
    recall = metrics.recall_score(y_test, y_pred, average='weighted')
    f1 = metrics.f1_score(y_test, y_pred, average='weighted')

    print(f"Accuracy: {accuracy:.6f}")
    print(f"Precision: {precision:.6f}")
    print(f"Recall: {recall:.6f}")
    print(f"F1 Score: {f1:.6f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    # Only apply the names when they line up with the matrix; mismatched
    # tick labels would mislabel the cells.
    if class_names is not None and len(class_names) == cm.shape[0]:
        tick_labels = list(class_names)
    else:
        tick_labels = "auto"

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()

    return accuracy, precision, recall, f1


# ---------------------------------------------------------------------------
# Train and evaluate the models
# ---------------------------------------------------------------------------
xgb_model = XGBClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
gbm_model = GradientBoostingClassifier(random_state=42)

models = {
    'XGBoost': xgb_model,
    'Random Forest': rf_model,
    'Gradient Boosting': gbm_model,
}

# Fit each model, report its metrics, and keep the measured accuracy so the
# comparison chart below always reflects this run.
accuracies = []
for model_name, model in models.items():
    print(f"=== {model_name} ===")
    model.fit(X_train, y_train)
    model_accuracy, _, _, _ = analyze_results(y_test, model.predict(X_test))
    accuracies.append(model_accuracy)

model_names = list(models)

plt.figure(figsize=(10, 6))
sns.barplot(x=model_names, y=accuracies, palette='viridis')
plt.xticks(rotation=45)
plt.ylim(0.95, 1.001)
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Comparison of Model Accuracies')
plt.show()

# ---------------------------------------------------------------------------
# Gradio demo
# ---------------------------------------------------------------------------
# Input fields follow the training column order exactly, so the values typed
# into the form line up with what the models were fitted on.
input_labels = list(X.columns)


def predict_fraud(*features):
    """Classify one transaction with the trained models xgb_model, rf_model and gbm_model.

    Args:
        *features: One numeric value per entry of ``input_labels``, in the
            same order.

    Returns:
        Tuple of three strings, ``"Fraud"`` or ``"Not Fraud"``, for XGBoost,
        Random Forest and Gradient Boosting respectively.

    Raises:
        gr.Error: If a value is missing or the number of values is wrong.
    """
    if len(features) != len(input_labels) or any(v is None for v in features):
        raise gr.Error(
            f"Please provide all {len(input_labels)} feature values."
        )

    # A one-row frame with the training column names keeps the feature order
    # explicit and matches the input type the models were fitted on.
    input_data = pd.DataFrame([features], columns=input_labels, dtype=float)

    return tuple(
        "Fraud" if model.predict(input_data)[0] == 1 else "Not Fraud"
        for model in (xgb_model, rf_model, gbm_model)
    )


gradio_interface = gr.Interface(
    fn=predict_fraud,
    inputs=[gr.Number(label=label) for label in input_labels],
    outputs=[
        gr.Textbox(label="XGBoost Prediction"),
        gr.Textbox(label="Random Forest Prediction"),
        gr.Textbox(label="Gradient Boosting Prediction"),
    ],
    title="Credit Card Fraud Detection with XGBoost, Random Forest, and Gradient Boosting",
    description="Enter transaction details to get predictions from three different models.",
)

# NOTE(security): share=True publishes the app on a public URL; switch it off
# if the demo should only be reachable locally.
gradio_interface.launch(share=True)