"""
Financial Fraud Detection System - TechMatrix Solvers
Team Members:
- Abhay Gupta
- Jay Kumar
- Kripanshu Gupta
- Bhumika Patel

A comprehensive fraud detection system using machine learning algorithms.
"""
|
|
import os
import pickle
import time
import warnings
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import streamlit as st
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, roc_curve
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
|
|
| |
# Silence library warnings so they do not clutter the app output.
warnings.filterwarnings('ignore')

# Page-level configuration, issued before any other st.* call in this script.
st.set_page_config(
    page_title="TechMatrix Fraud Detection System",
    page_icon="🔒",
    layout="wide",
    initial_sidebar_state="collapsed",
)
|
|
| |
# Inject the application's custom stylesheet (theme variables, headers, metric
# cards, buttons, progress bar, tables, and the fade-in / pulse animations).
# The metric-value colour rule keeps the original auto-generated class name
# (.css-1wivap2) and additionally targets data-testid="stMetricValue", because
# generated class names are not stable across Streamlit releases.
st.markdown("""
<style>
    /* Main theme colors */
    :root {
        --primary: #2E7D32;
        --primary-light: #81C784;
        --primary-dark: #1B5E20;
        --secondary: #1976D2;
        --secondary-light: #64B5F6;
        --text-on-primary: #FFFFFF;
        --text-primary: #212121;
        --text-secondary: #757575;
        --background: #F5F5F5;
        --card-bg: #FFFFFF;
        --success: #43A047;
        --warning: #FFA000;
        --error: #D32F2F;
        --info: #1976D2;
    }

    /* Base styles */
    .main-header {
        font-size: 2.8rem;
        color: var(--primary);
        text-align: center;
        margin-bottom: 1.5rem;
        font-weight: 700;
        background: linear-gradient(90deg, var(--primary), var(--secondary));
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        padding: 0.5rem 0;
    }

    .sub-header {
        font-size: 2rem;
        color: var(--primary-dark);
        margin-top: 2rem;
        margin-bottom: 1rem;
        font-weight: 600;
        border-bottom: 2px solid var(--primary-light);
        padding-bottom: 0.5rem;
    }

    .metric-card {
        text-align: center;
        padding: 1.2rem;
        border-radius: 0.8rem;
        background-color: rgba(46, 125, 50, 0.1);
        transition: transform 0.3s ease;
        border-left: 4px solid var(--primary);
    }

    .metric-card:hover {
        transform: translateY(-5px);
        background-color: rgba(46, 125, 50, 0.15);
    }

    .metric-value {
        font-size: 2.5rem;
        font-weight: 700;
        color: var(--primary);
        margin: 0.5rem 0;
    }

    .metric-label {
        font-size: 1rem;
        color: var(--text-secondary);
        margin-bottom: 0.5rem;
    }

    div[data-testid="stMetric"] {
        background-color: rgba(46, 125, 50, 0.1);
        padding: 1rem;
        border-radius: 0.8rem;
        border-left: 4px solid var(--primary);
        transition: transform 0.3s ease;
    }

    div[data-testid="stMetric"]:hover {
        transform: translateY(-5px);
        background-color: rgba(46, 125, 50, 0.15);
    }

    div[data-testid="stMetric"] > div {
        gap: 0.2rem;
    }

    div[data-testid="stMetric"] label {
        color: var(--text-secondary) !important;
    }

    div[data-testid="stMetric"] .css-1wivap2,
    div[data-testid="stMetric"] [data-testid="stMetricValue"] {
        color: var(--primary) !important;
    }

    .stButton > button {
        background-color: var(--primary);
        color: var(--text-on-primary);
        border-radius: 0.5rem;
        padding: 0.5rem 1rem;
        font-weight: 600;
        border: none;
        transition: all 0.3s ease;
    }

    .stButton > button:hover {
        background-color: var(--primary-dark);
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
        transform: translateY(-2px);
    }

    .stProgress > div > div > div {
        background-color: var(--primary);
        background-image: linear-gradient(45deg,
            rgba(255,255,255,.15) 25%,
            transparent 25%,
            transparent 50%,
            rgba(255,255,255,.15) 50%,
            rgba(255,255,255,.15) 75%,
            transparent 75%,
            transparent
        );
        background-size: 1rem 1rem;
        animation: progress-animation 1s linear infinite;
    }

    @keyframes progress-animation {
        0% { background-position: 0 0; }
        100% { background-position: 1rem 0; }
    }

    .success-text {
        color: var(--success);
        font-weight: bold;
    }

    .warning-text {
        color: var(--warning);
        font-weight: bold;
    }

    .error-text {
        color: var(--error);
        font-weight: bold;
    }

    .info-text {
        color: var(--info);
        font-weight: bold;
    }

    @keyframes fadeIn {
        from { opacity: 0; }
        to { opacity: 1; }
    }

    .animate-fade-in {
        animation: fadeIn 0.8s ease-in-out;
    }

    [data-testid="stSidebarNav"] ul li:nth-child(2) {
        display: none;
    }

    .dataframe {
        border-collapse: collapse;
        border: none;
        font-size: 0.9rem;
    }

    .dataframe th {
        background-color: var(--primary-light);
        color: var(--text-primary);
        padding: 0.5rem;
        text-align: left;
    }

    .dataframe td {
        padding: 0.5rem;
        border-bottom: 1px solid #eee;
    }

    .dataframe tr:hover {
        background-color: #f5f5f5;
    }

    .stSlider > div > div {
        background-color: var(--primary-light);
    }

    .stSelectbox > div > div {
        background-color: var(--card-bg);
        border-radius: 0.5rem;
        border: 1px solid var(--primary-light);
    }

    @keyframes pulse {
        0% { opacity: 0.6; }
        50% { opacity: 1; }
        100% { opacity: 0.6; }
    }

    .loading-pulse {
        animation: pulse 1.5s infinite ease-in-out;
    }
</style>
""", unsafe_allow_html=True)
|
|
| |
# Create the working directories if they are missing; exist_ok makes this a
# no-op on every later rerun of the script.
for _required_dir in ("data", "models"):
    os.makedirs(_required_dir, exist_ok=True)
|
|
| |
# Seed st.session_state with every key the page branches below read. A key
# that already holds a value from an earlier rerun is left untouched. The
# mapping is rebuilt on each script run, so the empty dict for
# 'trained_models' is a fresh object whenever it is actually assigned.
_SESSION_DEFAULTS = {
    'current_page': 'home',
    'data': None,
    'preprocessed_data': None,
    'engineered_data': None,
    'target_col': 'Class',
    'trained_models': {},
    'predictions': None,
    'progress': 0,
}

for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
|
|
| |
# Application title, rendered on every page with the fade-in animation class.
st.markdown(
    "<div class='animate-fade-in'><h1 class='main-header'>TechMatrix Fraud Detection System</h1></div>",
    unsafe_allow_html=True,
)

# Team credits shown directly under the title.
st.markdown("""
<div style='text-align: center; margin-bottom: 2rem;'>
    <h3>Team TechMatrix Solvers</h3>
    <p>Abhay Gupta | Jay Kumar | Kripanshu Gupta | Bhumika Patel</p>
</div>
""", unsafe_allow_html=True)
|
|
| |
if st.session_state['current_page'] == 'home':
    # Landing page: product overview, a decorative pipeline diagram, the
    # four-step workflow summary, two demo charts and the "Get Started" button.
    st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Welcome to TechMatrix Fraud Detection System</h2></div>", unsafe_allow_html=True)

    col1, col2 = st.columns([2, 1])

    with col1:
        st.markdown("""
        Our advanced fraud detection system leverages cutting-edge machine learning algorithms to identify and prevent fraudulent transactions in real-time.

        ### Understanding Financial Fraud

        Financial fraud encompasses various deceptive practices aimed at unauthorized acquisition of funds or assets.
        Our system specifically addresses:
        - Credit card transaction fraud
        - Identity theft incidents
        - Account compromise attempts
        - Suspicious transaction patterns

        ### Machine Learning Implementation

        Our system employs sophisticated machine learning models that analyze transaction patterns and behavioral data.
        The models are trained on historical fraud data and continuously updated to adapt to emerging fraud patterns.

        ### System Advantages:
        - **Real-time Monitoring**: Instant detection of suspicious activities
        - **Scalable Processing**: Efficient handling of large transaction volumes
        - **Pattern Recognition**: Advanced detection of complex fraud patterns
        - **Risk Assessment**: Probability-based fraud scoring system
        """)

    with col2:
        # Decorative hexagon: six pipeline stages placed evenly on a circle of
        # radius 0.4 centred at (0.5, 0.5) inside a unit square.
        fig = go.Figure()

        angles = np.linspace(0, 2 * np.pi, 6, endpoint=False)
        x = 0.5 + 0.4 * np.cos(angles)
        y = 0.5 + 0.4 * np.sin(angles)
        node_total = len(angles)

        # Dotted edges joining each node to the next; the modulo wraps the
        # last node back to the first so the outline is closed.
        for i, (start_x, start_y) in enumerate(zip(x, y)):
            next_i = (i + 1) % node_total
            fig.add_trace(go.Scatter(
                x=[start_x, x[next_i]],
                y=[start_y, y[next_i]],
                mode='lines',
                line=dict(
                    color='rgba(46, 125, 50, 0.5)',
                    width=2,
                    dash='dot'
                ),
                showlegend=False
            ))

        node_labels = ['Input Data', 'Validation', 'Processing', 'Analysis', 'Detection', 'Action']
        node_colors = ['#2E7D32', '#43A047', '#81C784', '#1976D2', '#64B5F6', '#D32F2F']

        # One labelled hexagon marker per pipeline stage.
        for node_x, node_y, node_label, node_color in zip(x, y, node_labels, node_colors):
            fig.add_trace(go.Scatter(
                x=[node_x],
                y=[node_y],
                mode='markers+text',
                marker=dict(
                    size=30,
                    color=node_color,
                    symbol='hexagon'
                ),
                text=node_label,
                textposition="middle center",
                textfont=dict(color='white', size=12),
                showlegend=False
            ))

        # Caption in the centre of the hexagon.
        fig.add_trace(go.Scatter(
            x=[0.5],
            y=[0.5],
            mode='text',
            text='Fraud<br>Detection<br>Pipeline',
            textposition="middle center",
            textfont=dict(
                color='#212121',
                size=14,
                family='Arial, bold'
            ),
            showlegend=False
        ))

        # Hide every axis decoration so only the diagram itself is visible.
        hidden_axis = dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[0, 1]
        )
        fig.update_layout(
            height=400,
            width=400,
            margin=dict(l=0, r=0, t=0, b=0),
            xaxis=hidden_axis,
            yaxis=hidden_axis,
            plot_bgcolor='rgba(0,0,0,0)'
        )

        st.plotly_chart(fig)

    st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>System Workflow</h2></div>", unsafe_allow_html=True)

    # (heading, description, icon URL) for each of the four workflow cards.
    workflow_steps = [
        ("### 1. Data Ingestion",
         "Secure upload and validation of transaction data in CSV format.",
         "https://cdn-icons-png.flaticon.com/512/4208/4208479.png"),
        ("### 2. Data Processing",
         "Advanced data cleaning and preparation for analysis.",
         "https://cdn-icons-png.flaticon.com/512/1875/1875627.png"),
        ("### 3. Feature Extraction",
         "Intelligent feature engineering and pattern recognition.",
         "https://cdn-icons-png.flaticon.com/512/2103/2103633.png"),
        ("### 4. Model Deployment",
         "Real-time fraud detection and risk assessment.",
         "https://cdn-icons-png.flaticon.com/512/2103/2103658.png"),
    ]

    for step_column, (step_heading, step_text, step_icon) in zip(st.columns(4), workflow_steps):
        with step_column:
            st.markdown(step_heading)
            st.markdown(step_text)
            st.image(step_icon, width=100)

    st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>System Analytics</h2></div>", unsafe_allow_html=True)

    col1, col2 = st.columns(2)

    with col1:
        # Illustrative ROC curves: the points and the AUC figures in the
        # legend are hard-coded here, not computed from any trained model.
        fig = go.Figure()
        fpr = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        roc_series = [
            ('Logistic Regression (AUC = 0.85)',
             [0, 0.4, 0.55, 0.68, 0.75, 0.8, 0.85, 0.9, 0.94, 0.98, 1.0],
             '#2E7D32'),
            ('Random Forest (AUC = 0.92)',
             [0, 0.5, 0.65, 0.78, 0.85, 0.88, 0.91, 0.95, 0.97, 0.99, 1.0],
             '#1976D2'),
            ('XGBoost (AUC = 0.94)',
             [0, 0.55, 0.7, 0.8, 0.87, 0.9, 0.93, 0.96, 0.98, 0.99, 1.0],
             '#D32F2F'),
        ]

        for series_name, tpr, series_color in roc_series:
            fig.add_trace(go.Scatter(
                x=fpr,
                y=tpr,
                mode='lines',
                name=series_name,
                line=dict(color=series_color, width=3)
            ))

        # Diagonal reference line for a random classifier.
        fig.add_trace(go.Scatter(
            x=[0, 1],
            y=[0, 1],
            mode='lines',
            name='Random',
            line=dict(dash='dash', color='#757575', width=2)
        ))

        fig.update_layout(
            title='Model Performance Comparison',
            xaxis_title='False Positive Rate',
            yaxis_title='True Positive Rate',
            legend=dict(x=0.01, y=0.99),
            width=600,
            height=400,
            template='plotly_white',
            margin=dict(l=40, r=40, t=40, b=40)
        )

        st.plotly_chart(fig)

    with col2:
        # Illustrative feature-importance chart (hard-coded values as well).
        features = ['Transaction Amount', 'Time of Day', 'Merchant Category', 'Location', 'Transaction Frequency',
                    'Device Used', 'IP Address', 'Account Age', 'Previous Fraud Flag', 'Transaction Type']
        importance = [0.23, 0.18, 0.15, 0.12, 0.09, 0.08, 0.06, 0.04, 0.03, 0.02]

        fig = px.bar(
            x=importance,
            y=features,
            orientation='h',
            title='Feature Importance Analysis',
            labels={'x': 'Importance Score', 'y': 'Feature'},
            color=importance,
            color_continuous_scale=['#2E7D32', '#43A047', '#81C784']
        )

        fig.update_layout(
            width=600,
            height=400,
            template='plotly_white',
            margin=dict(l=40, r=40, t=40, b=40)
        )
        st.plotly_chart(fig)

    # Entry point into the workflow: switch to the upload page and rerun.
    st.markdown("<div style='text-align: center; margin-top: 2rem;'>", unsafe_allow_html=True)
    if st.button("Get Started", key="get_started", help="Begin the fraud detection process"):
        st.session_state['current_page'] = 'upload'
        st.rerun()
    st.markdown("</div>", unsafe_allow_html=True)
|
|
| |
| elif st.session_state['current_page'] == 'upload': |
| st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Step 1: Data Ingestion</h2></div>", unsafe_allow_html=True) |
| |
| |
| st.markdown(""" |
| ### Secure Data Upload |
| |
| Upload your transaction data securely in CSV format. The system supports the following: |
| |
| - Transaction details (amount, timestamp, location, etc.) |
| - Target column for fraud classification (default: 'Class' with 0 for normal, 1 for fraud) |
| - **Maximum file size: 200 MB** |
| |
| For testing purposes, you can use the [Credit Card Fraud Detection dataset](https://www.kaggle.com/mlg-ulb/creditcardfraud) from Kaggle. |
| |
| ### Data Requirements: |
| - CSV format with UTF-8 encoding |
| - No missing values in critical fields |
| - Proper date/time formatting |
| - Numeric values for transaction amounts |
| """) |
| |
| uploaded_file = st.file_uploader( |
| "Upload transaction data (CSV file)", |
| type="csv", |
| help="Maximum file size: 200 MB" |
| ) |
|
|
| if uploaded_file is not None: |
| |
| file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type} |
| |
| |
| file_buffer = uploaded_file.getvalue() |
| file_size_mb = len(file_buffer) / (1024 * 1024) |
| |
| if file_size_mb > 200: |
| st.error(f"File size exceeds the 200 MB limit. Your file is {file_size_mb:.2f} MB. Please upload a smaller file.") |
| st.stop() |
| else: |
| st.info(f"File size: {file_size_mb:.2f} MB") |
| |
| |
| progress_bar = st.progress(0) |
| status_text = st.empty() |
| |
| status_text.text("Initializing data ingestion...") |
| progress_bar.progress(25) |
| time.sleep(0.3) |
| |
| try: |
| |
| from io import BytesIO |
| df = pd.read_csv(BytesIO(file_buffer)) |
| st.session_state['data'] = df |
| |
| progress_bar.progress(50) |
| status_text.text("Validating data structure...") |
| time.sleep(0.3) |
| |
| progress_bar.progress(75) |
| status_text.text("Preparing data preview...") |
| time.sleep(0.3) |
| |
| progress_bar.progress(100) |
| status_text.text("Data ingestion completed!") |
| time.sleep(0.3) |
| |
| status_text.empty() |
| progress_bar.empty() |
| |
| |
| st.success(f"Data ingested successfully! Shape: {df.shape[0]} rows and {df.shape[1]} columns") |
| |
| col1, col2 = st.columns(2) |
| |
| with col1: |
| st.subheader("Data Preview") |
| st.dataframe(df.head()) |
| |
| with col2: |
| st.subheader("Data Structure") |
| |
| |
| data_info = pd.DataFrame({ |
| 'Data Type': df.dtypes, |
| 'Non-Null Count': df.count(), |
| 'Missing Values': df.isnull().sum(), |
| 'Unique Values': [df[col].nunique() for col in df.columns] |
| }) |
| |
| st.dataframe(data_info) |
| |
| |
| if 'Class' in df.columns: |
| fraud_count = df['Class'].sum() |
| total_count = len(df) |
| fraud_percentage = (fraud_count / total_count) * 100 |
| |
| st.info(f"Target column 'Class' detected with {fraud_count} fraud cases ({fraud_percentage:.2f}% of data)") |
| else: |
| st.warning("No 'Class' column detected. You'll need to specify the target column in the next step.") |
| except Exception as e: |
| st.error(f"Error during data ingestion: {str(e)}") |
| st.info("Please ensure the file is a valid CSV with proper formatting.") |
| |
| |
| col1, col2 = st.columns([1, 5]) |
| |
| with col1: |
| if st.button("← Back to Home", key="back_to_home"): |
| st.session_state['current_page'] = 'home' |
| st.rerun() |
| |
| with col2: |
| if st.session_state['data'] is not None: |
| if st.button("Continue to Data Processing →", key="to_preprocess"): |
| st.session_state['current_page'] = 'preprocess' |
| st.rerun() |
|
|
| |
| elif st.session_state['current_page'] == 'preprocess': |
| st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Step 2: Data Processing</h2></div>", unsafe_allow_html=True) |
| |
| if st.session_state['data'] is None: |
| st.error("No data found. Please upload data first.") |
| if st.button("Go back to Data Ingestion"): |
| st.session_state['current_page'] = 'upload' |
| st.rerun() |
| else: |
| df = st.session_state['data'] |
| |
| st.markdown(""" |
| ### Advanced Data Processing |
| |
| Enhance your data quality through our comprehensive processing pipeline. The system will: |
| - Handle missing values intelligently |
| - Remove statistical outliers |
| - Normalize numerical features |
| - Balance class distribution |
| |
| Select the processing options below to customize the pipeline. |
| """) |
| |
| |
| if 'Class' in df.columns: |
| target_col = 'Class' |
| st.info(f"Target column 'Class' detected with values: {df[target_col].unique()}") |
| else: |
| target_col = st.selectbox("Select the target column (fraud indicator)", df.columns) |
| |
| st.session_state['target_col'] = target_col |
| |
| |
| st.subheader("Processing Options") |
| |
| col1, col2 = st.columns(2) |
| |
| with col1: |
| handle_missing = st.checkbox("Handle Missing Values", value=True, |
| help="Fill missing numerical values with mean and categorical values with mode") |
| remove_outliers = st.checkbox("Remove Outliers", value=False, |
| help="Remove extreme values that might affect model performance") |
| |
| with col2: |
| normalize_data = st.checkbox("Normalize Data", value=True, |
| help="Scale numerical features to have zero mean and unit variance") |
| balance_classes = st.checkbox("Balance Classes", value=True, |
| help="Handle class imbalance using SMOTE in the training phase") |
| |
| |
| if st.button("Process Data"): |
| with st.spinner("Processing data..."): |
| |
| df_processed = df.copy() |
| |
| |
| progress_bar = st.progress(0) |
| status_text = st.empty() |
| |
| |
| if handle_missing: |
| status_text.text("Processing missing values...") |
| progress_bar.progress(25) |
| time.sleep(0.3) |
| |
| for col in df_processed.columns: |
| if df_processed[col].dtype in ['int64', 'float64']: |
| df_processed[col] = df_processed[col].fillna(df_processed[col].mean()) |
| else: |
| df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0]) |
| |
| |
| if remove_outliers: |
| status_text.text("Processing outliers...") |
| progress_bar.progress(50) |
| time.sleep(0.3) |
| |
| |
| num_cols = df_processed.select_dtypes(include=['int64', 'float64']).columns |
| for col in num_cols: |
| if col != target_col: |
| Q1 = df_processed[col].quantile(0.25) |
| Q3 = df_processed[col].quantile(0.75) |
| IQR = Q3 - Q1 |
| lower_bound = Q1 - 3 * IQR |
| upper_bound = Q3 + 3 * IQR |
| df_processed = df_processed[(df_processed[col] >= lower_bound) & |
| (df_processed[col] <= upper_bound)] |
| |
| |
| status_text.text("Finalizing data processing...") |
| progress_bar.progress(100) |
| time.sleep(0.3) |
| |
| st.session_state['preprocessed_data'] = df_processed |
| |
| status_text.empty() |
| progress_bar.empty() |
| |
| st.success("Data processing completed!") |
| |
| |
| if target_col in df_processed.columns: |
| st.subheader("Class Distribution After Processing") |
| |
| col1, col2 = st.columns(2) |
| |
| with col1: |
| |
| labels = ['Normal', 'Fraud'] |
| values = [len(df_processed[df_processed[target_col] == 0]), |
| len(df_processed[df_processed[target_col] == 1])] |
| |
| fig = px.pie( |
| values=values, |
| names=labels, |
| title='Transaction Distribution', |
| color_discrete_sequence=['#2E7D32', '#D32F2F'], |
| hole=0.4 |
| ) |
| |
| fig.update_traces(textposition='inside', textinfo='percent+label') |
| fig.update_layout( |
| template='plotly_white', |
| margin=dict(l=20, r=20, t=30, b=20) |
| ) |
| st.plotly_chart(fig) |
| |
| with col2: |
| |
| fraud_count = df_processed[target_col].sum() |
| total_count = len(df_processed) |
| fraud_percentage = (fraud_count / total_count) * 100 |
| |
| st.metric("Total Transactions", f"{total_count:,}") |
| st.metric("Fraud Transactions", f"{fraud_count:,}") |
| st.metric("Fraud Percentage", f"{fraud_percentage:.2f}%") |
| |
| if fraud_percentage < 1: |
| st.warning("Your dataset is highly imbalanced. Class balancing will be applied during model training.") |
| |
| |
| col1, col2 = st.columns([1, 5]) |
| |
| with col1: |
| if st.button("← Back to Upload", key="back_to_upload"): |
| st.session_state['current_page'] = 'upload' |
| st.rerun() |
| |
| with col2: |
| if st.session_state['preprocessed_data'] is not None: |
| if st.button("Continue to Feature Extraction →", key="to_feature_eng"): |
| st.session_state['current_page'] = 'feature_engineering' |
| st.rerun() |
|
|
| |
| elif st.session_state['current_page'] == 'feature_engineering': |
| st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Step 3: Feature Extraction</h2></div>", unsafe_allow_html=True) |
| |
| if st.session_state['preprocessed_data'] is None: |
| st.error("No processed data found. Please complete data processing first.") |
| if st.button("Go back to Data Processing"): |
| st.session_state['current_page'] = 'preprocess' |
| st.rerun() |
| else: |
| df_processed = st.session_state['preprocessed_data'] |
| target_col = st.session_state['target_col'] |
| |
| st.markdown(""" |
| ### Intelligent Feature Extraction |
| |
| Enhance your fraud detection capabilities through advanced feature engineering. Our system provides: |
| - Time-based pattern analysis |
| - Transaction amount profiling |
| - Behavioral feature extraction |
| - Cross-feature interaction analysis |
| |
| Select the features to extract below to optimize your model's performance. |
| """) |
| |
| |
| st.subheader("Feature Extraction Options") |
| |
| col1, col2 = st.columns(2) |
| |
| with col1: |
| create_time_features = st.checkbox("Time-based Features", value=True, |
| help="Extract temporal patterns and behavioral indicators") |
| create_amount_features = st.checkbox("Amount-based Features", value=True, |
| help="Generate transaction amount profiles and risk indicators") |
| |
| with col2: |
| create_aggregations = st.checkbox("Aggregation Features", value=False, |
| help="Create aggregated metrics for transaction patterns") |
| create_interactions = st.checkbox("Interaction Features", value=False, |
| help="Generate cross-feature interactions for complex pattern detection") |
| |
| |
| if st.button("Extract Features"): |
| with st.spinner("Extracting features..."): |
| |
| df_engineered = df_processed.copy() |
| |
| |
| progress_bar = st.progress(0) |
| status_text = st.empty() |
| |
| |
| if create_time_features and 'Time' in df_engineered.columns: |
| status_text.text("Extracting temporal features...") |
| progress_bar.progress(25) |
| time.sleep(0.3) |
| |
| |
| df_engineered['Hour'] = (df_engineered['Time'] / 3600) % 24 |
| |
| |
| df_engineered['Odd_Hour'] = ((df_engineered['Hour'] >= 0) & (df_engineered['Hour'] < 5)).astype(int) |
| |
| |
| df_engineered['Part_of_Day'] = pd.cut( |
| df_engineered['Hour'], |
| bins=[0, 6, 12, 18, 24], |
| labels=['Night', 'Morning', 'Afternoon', 'Evening'] |
| ) |
| |
| |
| if create_amount_features and 'Amount' in df_engineered.columns: |
| status_text.text("Extracting amount-based features...") |
| progress_bar.progress(50) |
| time.sleep(0.3) |
| |
| |
| df_engineered['Log_Amount'] = np.log1p(df_engineered['Amount']) |
| |
| |
| threshold = df_engineered['Amount'].quantile(0.95) |
| df_engineered['High_Value'] = (df_engineered['Amount'] > threshold).astype(int) |
| |
| |
| df_engineered['Amount_Bin'] = pd.qcut( |
| df_engineered['Amount'], |
| q=5, |
| labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'] |
| ) |
| |
| |
| if create_aggregations: |
| status_text.text("Generating aggregation features...") |
| progress_bar.progress(75) |
| time.sleep(0.3) |
| |
| |
| potential_id_cols = [col for col in df_engineered.columns if 'id' in col.lower() or 'card' in col.lower()] |
| |
| if potential_id_cols: |
| id_col = potential_id_cols[0] |
| |
| |
| tx_count = df_engineered.groupby(id_col).size().reset_index(name='Tx_Count') |
| df_engineered = df_engineered.merge(tx_count, on=id_col, how='left') |
| |
| |
| if 'Amount' in df_engineered.columns: |
| avg_amount = df_engineered.groupby(id_col)['Amount'].mean().reset_index(name='Avg_Amount') |
| df_engineered = df_engineered.merge(avg_amount, on=id_col, how='left') |
| |
| |
| df_engineered['Amount_Deviation'] = df_engineered['Amount'] - df_engineered['Avg_Amount'] |
| |
| |
| if create_interactions: |
| status_text.text("Generating interaction features...") |
| progress_bar.progress(90) |
| time.sleep(0.3) |
| |
| |
| num_cols = df_engineered.select_dtypes(include=['int64', 'float64']).columns |
| num_cols = [col for col in num_cols if col != target_col and 'id' not in col.lower()] |
| |
| |
| if len(num_cols) > 3: |
| num_cols = num_cols[:3] |
| |
| |
| for i in range(len(num_cols)): |
| for j in range(i+1, len(num_cols)): |
| col1_name = num_cols[i] |
| col2_name = num_cols[j] |
| df_engineered[f'{col1_name}_x_{col2_name}'] = df_engineered[col1_name] * df_engineered[col2_name] |
| |
| |
| cat_cols = df_engineered.select_dtypes(include=['object', 'category']).columns |
| for col in cat_cols: |
| dummies = pd.get_dummies(df_engineered[col], prefix=col, drop_first=True) |
| df_engineered = pd.concat([df_engineered, dummies], axis=1) |
| df_engineered.drop(columns=[col], inplace=True) |
| |
| |
| status_text.text("Finalizing feature extraction...") |
| progress_bar.progress(100) |
| time.sleep(0.3) |
| |
| st.session_state['engineered_data'] = df_engineered |
| |
| status_text.empty() |
| progress_bar.empty() |
| |
| st.success("Feature extraction completed!") |
| |
| |
| if target_col in df_engineered.columns: |
| st.subheader("Feature Correlation Analysis") |
| |
| |
| corr_with_target = df_engineered.corr()[target_col].sort_values(ascending=False) |
| |
| |
| corr_with_target = corr_with_target.drop(target_col) |
| |
| |
| top_pos = corr_with_target.head(10) |
| top_neg = corr_with_target.tail(10).iloc[::-1] |
| |
| col1, col2 = st.columns(2) |
| |
| with col1: |
| |
| fig = px.bar( |
| x=top_pos.values, |
| y=top_pos.index, |
| orientation='h', |
| title='Top Positive Correlations with Fraud', |
| labels={'x': 'Correlation', 'y': 'Feature'}, |
| color=top_pos.values, |
| color_continuous_scale=['#2E7D32', '#43A047', '#81C784'] |
| ) |
| |
| fig.update_layout( |
| height=400, |
| template='plotly_white', |
| margin=dict(l=20, r=20, t=40, b=20) |
| ) |
| st.plotly_chart(fig) |
| |
| with col2: |
| |
| fig = px.bar( |
| x=top_neg.values, |
| y=top_neg.index, |
| orientation='h', |
| title='Top Negative Correlations with Fraud', |
| labels={'x': 'Correlation', 'y': 'Feature'}, |
| color=top_neg.values, |
| color_continuous_scale=['#81C784', '#43A047', '#2E7D32'] |
| ) |
| |
| fig.update_layout( |
| height=400, |
| template='plotly_white', |
| margin=dict(l=20, r=20, t=40, b=20) |
| ) |
| st.plotly_chart(fig) |
| |
| |
| st.subheader("Feature Correlation Matrix") |
| |
| |
| corr_matrix = df_engineered.corr() |
| top_corr_features = corr_with_target.abs().sort_values(ascending=False).head(15).index |
| |
| |
| top_corr_matrix = corr_matrix.loc[top_corr_features, top_corr_features] |
| |
| fig = px.imshow( |
| top_corr_matrix, |
| text_auto='.2f', |
| color_continuous_scale=['#2E7D32', 'white', '#1976D2'], |
| title='Feature Correlation Matrix' |
| ) |
| |
| fig.update_layout( |
| height=600, |
| width=800, |
| template='plotly_white', |
| margin=dict(l=20, r=20, t=40, b=20) |
| ) |
| st.plotly_chart(fig) |
| |
| |
| st.subheader("Feature Distribution Analysis") |
| |
| |
| numeric_cols = df_engineered.select_dtypes(include=['int64', 'float64']).columns |
| numeric_cols = [col for col in numeric_cols if col != target_col] |
| |
| selected_feature = st.selectbox("Select feature to analyze", numeric_cols) |
| |
| |
| fig = px.histogram( |
| df_engineered, |
| x=selected_feature, |
| color=target_col, |
| marginal="box", |
| opacity=0.7, |
| barmode="overlay", |
| color_discrete_map={0: "#2E7D32", 1: "#D32F2F"}, |
| labels={target_col: "Class", "0": "Normal", "1": "Fraud"} |
| ) |
| |
| fig.update_layout( |
| title=f"Distribution Analysis of {selected_feature}", |
| template='plotly_white', |
| margin=dict(l=20, r=20, t=40, b=20) |
| ) |
| st.plotly_chart(fig) |
| |
| |
| col1, col2 = st.columns([1, 5]) |
| |
| with col1: |
| if st.button("← Back to Processing", key="back_to_preprocess"): |
| st.session_state['current_page'] = 'preprocess' |
| st.rerun() |
| |
| with col2: |
| if st.session_state['engineered_data'] is not None: |
| if st.button("Continue to Model Training →", key="to_model_training"): |
| st.session_state['current_page'] = 'model_training' |
| st.rerun() |
|
|
| |
| elif st.session_state['current_page'] == 'model_training': |
| st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Step 4: Model Training</h2></div>", unsafe_allow_html=True) |
| |
| if st.session_state['engineered_data'] is None: |
| st.error("No engineered data found. Please complete feature extraction first.") |
| if st.button("Go back to Feature Extraction"): |
| st.session_state['current_page'] = 'feature_engineering' |
| st.rerun() |
| else: |
| df_engineered = st.session_state['engineered_data'] |
| target_col = st.session_state['target_col'] |
| |
| st.markdown(""" |
| ### Advanced Model Training |
| |
| Train sophisticated machine learning models for fraud detection. Our system provides: |
| - Multiple model architectures |
| - Automated hyperparameter optimization |
| - Cross-validation for robust evaluation |
| - Performance metrics visualization |
| |
| Select your preferred models and training parameters below. |
| """) |
| |
| |
| st.subheader("Training Configuration") |
| |
| col1, col2 = st.columns(2) |
| |
| with col1: |
| |
| use_sample = st.checkbox("Use Data Sample for Faster Training", value=True, |
| help="Use a sample of the data to speed up training (recommended for large datasets)") |
| |
| if use_sample: |
| sample_size = st.slider("Sample Size (%)", min_value=10, max_value=100, value=20, |
| help="Percentage of data to use for training") |
| |
| |
| test_size = st.slider("Test Set Size (%)", min_value=10, max_value=50, value=20, |
| help="Percentage of data to use for testing") |
| |
| |
| use_smote = st.checkbox("Apply SMOTE for Class Balancing", value=True, |
| help="Use SMOTE to handle class imbalance") |
| |
| with col2: |
| |
| st.write("Select Models to Train:") |
| train_lr = st.checkbox("Logistic Regression", value=True) |
| train_rf = st.checkbox("Random Forest", value=True) |
| train_xgb = st.checkbox("XGBoost", value=True) |
| |
| |
| show_advanced = st.checkbox("Show Advanced Options", value=False) |
| |
| if show_advanced: |
| |
| n_estimators = st.slider("Number of Estimators", min_value=10, max_value=200, value=50, |
| help="Number of trees for Random Forest and XGBoost (higher = more accurate but slower)") |
| |
| |
| max_depth = st.slider("Max Tree Depth", min_value=3, max_value=15, value=6, |
| help="Maximum depth of trees (higher = more complex model)") |
| |
| |
| if st.button("Train Models"): |
| with st.spinner("Training models..."): |
| status_container = st.empty() |
| status_container.markdown( |
| '<div class="loading-pulse">Training in progress... This may take a few minutes.</div>', |
| unsafe_allow_html=True |
| ) |
| |
| X = df_engineered.drop(columns=[target_col]) |
| y = df_engineered[target_col] |
| |
| |
| if use_sample and sample_size < 100: |
| sample_frac = sample_size / 100 |
| |
| X_sample = pd.DataFrame() |
| y_sample = pd.Series() |
| |
| for class_value in y.unique(): |
| X_class = X[y == class_value] |
| y_class = y[y == class_value] |
| |
| n_samples = int(len(X_class) * sample_frac) |
| indices = np.random.choice(X_class.index, size=n_samples, replace=False) |
| |
| X_sample = pd.concat([X_sample, X_class.loc[indices]]) |
| y_sample = pd.concat([y_sample, y_class.loc[indices]]) |
| |
| X = X_sample |
| y = y_sample |
| |
| |
| progress_bar = st.progress(0) |
| status_text = st.empty() |
| |
| status_text.text("Preparing training data...") |
| progress_bar.progress(10) |
| |
| |
| X_train, X_test, y_train, y_test = train_test_split( |
| X, y, test_size=test_size/100, random_state=42, stratify=y |
| ) |
| |
| status_text.text("Scaling features...") |
| progress_bar.progress(20) |
| |
| |
| scaler = StandardScaler() |
| X_train_scaled = scaler.fit_transform(X_train) |
| X_test_scaled = scaler.transform(X_test) |
| |
| |
| if use_smote: |
| status_text.text("Applying SMOTE for class balancing...") |
| progress_bar.progress(30) |
| |
| smote = SMOTE(random_state=42) |
| X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train) |
| else: |
| X_train_resampled, y_train_resampled = X_train_scaled, y_train |
| |
| |
| with open("models/scaler.pkl", "wb") as f: |
| pickle.dump(scaler, f) |
| |
| |
| with open("models/feature_columns.pkl", "wb") as f: |
| pickle.dump(X.columns.tolist(), f) |
| |
| |
| results = [] |
| trained_models = {} |
| |
| |
| if train_lr: |
| status_text.text("Training Logistic Regression...") |
| progress_bar.progress(40) |
| |
| |
| lr_model = LogisticRegression(max_iter=1000, class_weight='balanced') |
| lr_model.fit(X_train_resampled, y_train_resampled) |
| |
| |
| y_pred = lr_model.predict(X_test_scaled) |
| y_pred_proba = lr_model.predict_proba(X_test_scaled)[:, 1] |
| |
| |
| accuracy = accuracy_score(y_test, y_pred) |
| precision = precision_score(y_test, y_pred) |
| recall = recall_score(y_test, y_pred) |
| f1 = f1_score(y_test, y_pred) |
| auc = roc_auc_score(y_test, y_pred_proba) |
| cm = confusion_matrix(y_test, y_pred) |
| |
| |
| lr_results = { |
| 'model_name': 'Logistic Regression', |
| 'model': lr_model, |
| 'accuracy': accuracy, |
| 'precision': precision, |
| 'recall': recall, |
| 'f1_score': f1, |
| 'auc': auc, |
| 'confusion_matrix': cm, |
| 'y_test': y_test, |
| 'y_pred_proba': y_pred_proba |
| } |
| |
| results.append(lr_results) |
| trained_models['lr'] = lr_model |
| |
| |
| with open("models/logistic_regression.pkl", "wb") as f: |
| pickle.dump(lr_model, f) |
| |
| if train_rf: |
| status_text.text("Training Random Forest...") |
| progress_bar.progress(60) |
| |
| |
| n_est = n_estimators if show_advanced else 50 |
| m_depth = max_depth if show_advanced else 6 |
| |
| |
| rf_model = RandomForestClassifier( |
| n_estimators=n_est, |
| max_depth=m_depth, |
| class_weight='balanced', |
| random_state=42 |
| ) |
| rf_model.fit(X_train_resampled, y_train_resampled) |
| |
| |
| y_pred = rf_model.predict(X_test_scaled) |
| y_pred_proba = rf_model.predict_proba(X_test_scaled)[:, 1] |
| |
| |
| accuracy = accuracy_score(y_test, y_pred) |
| precision = precision_score(y_test, y_pred) |
| recall = recall_score(y_test, y_pred) |
| f1 = f1_score(y_test, y_pred) |
| auc = roc_auc_score(y_test, y_pred_proba) |
| cm = confusion_matrix(y_test, y_pred) |
| |
| |
| rf_results = { |
| 'model_name': 'Random Forest', |
| 'model': rf_model, |
| 'accuracy': accuracy, |
| 'precision': precision, |
| 'recall': recall, |
| 'f1_score': f1, |
| 'auc': auc, |
| 'confusion_matrix': cm, |
| 'y_test': y_test, |
| 'y_pred_proba': y_pred_proba |
| } |
| |
| results.append(rf_results) |
| trained_models['rf'] = rf_model |
| |
| |
| with open("models/random_forest.pkl", "wb") as f: |
| pickle.dump(rf_model, f) |
| |
| if train_xgb: |
| status_text.text("Training XGBoost...") |
| progress_bar.progress(80) |
| |
| |
| n_est = n_estimators if show_advanced else 50 |
| m_depth = max_depth if show_advanced else 6 |
| |
| |
| xgb_model = XGBClassifier( |
| n_estimators=n_est, |
| max_depth=m_depth, |
| scale_pos_weight=10, |
| random_state=42, |
| use_label_encoder=False, |
| eval_metric='logloss' |
| ) |
| xgb_model.fit(X_train_resampled, y_train_resampled) |
| |
| |
| y_pred = xgb_model.predict(X_test_scaled) |
| y_pred_proba = xgb_model.predict_proba(X_test_scaled)[:, 1] |
| |
| |
| accuracy = accuracy_score(y_test, y_pred) |
| precision = precision_score(y_test, y_pred) |
| recall = recall_score(y_test, y_pred) |
| f1 = f1_score(y_test, y_pred) |
| auc = roc_auc_score(y_test, y_pred_proba) |
| cm = confusion_matrix(y_test, y_pred) |
| |
| |
| xgb_results = { |
| 'model_name': 'XGBoost', |
| 'model': xgb_model, |
| 'accuracy': accuracy, |
| 'precision': precision, |
| 'recall': recall, |
| 'f1_score': f1, |
| 'auc': auc, |
| 'confusion_matrix': cm, |
| 'y_test': y_test, |
| 'y_pred_proba': y_pred_proba |
| } |
| |
| results.append(xgb_results) |
| trained_models['xgb'] = xgb_model |
| |
| |
| with open("models/xgboost.pkl", "wb") as f: |
| pickle.dump(xgb_model, f) |
| |
| |
| with open("models/test_data.pkl", "wb") as f: |
| pickle.dump({"X_test": X_test_scaled, "y_test": y_test}, f) |
| |
| st.session_state['trained_models'] = trained_models |
| |
| |
| status_text.text("Generating predictions...") |
| progress_bar.progress(90) |
| |
| |
| best_model = None |
| best_f1 = -1 |
| best_model_name = "" |
| |
| for result in results: |
| if result['f1_score'] > best_f1: |
| best_f1 = result['f1_score'] |
| best_model = result['model'] |
| best_model_name = result['model_name'] |
| |
| if best_model is not None: |
| |
| X_full = df_engineered.drop(columns=[target_col]) |
| |
| |
| X_full_scaled = scaler.transform(X_full) |
| |
| |
| y_pred = best_model.predict(X_full_scaled) |
| y_pred_proba = best_model.predict_proba(X_full_scaled)[:, 1] |
| |
| |
| df_with_predictions = df_engineered.copy() |
| df_with_predictions['Fraud_Probability'] = y_pred_proba |
| df_with_predictions['Predicted_Fraud'] = y_pred |
| |
| |
| st.session_state['predictions'] = { |
| 'df': df_with_predictions, |
| 'model_name': best_model_name, |
| 'results': results |
| } |
| |
| status_text.text("Training completed!") |
| progress_bar.progress(100) |
| time.sleep(0.3) |
| |
| status_text.empty() |
| progress_bar.empty() |
| |
| st.success("Models trained successfully!") |
| |
| |
| if results: |
| st.subheader("Model Performance Analysis") |
| |
| |
| comparison_df = pd.DataFrame([ |
| { |
| 'Model': r['model_name'], |
| 'Accuracy': r['accuracy'], |
| 'Precision': r['precision'], |
| 'Recall': r['recall'], |
| 'F1 Score': r['f1_score'], |
| 'AUC': r['auc'] |
| } for r in results |
| ]) |
| |
| st.dataframe(comparison_df.style.highlight_max(axis=0, color='#81C784')) |
| |
| |
| fig = px.bar( |
| comparison_df.melt(id_vars=['Model'], var_name='Metric', value_name='Value'), |
| x='Model', |
| y='Value', |
| color='Metric', |
| barmode='group', |
| title='Model Performance Comparison', |
| labels={'Value': 'Score', 'Model': 'Model'}, |
| color_discrete_sequence=['#2E7D32', '#43A047', '#81C784', '#1976D2', '#D32F2F'] |
| ) |
| |
| fig.update_layout( |
| height=500, |
| template='plotly_white', |
| margin=dict(l=20, r=20, t=40, b=20) |
| ) |
| st.plotly_chart(fig) |
| |
| |
| st.subheader("ROC Curve Analysis") |
| |
| fig = go.Figure() |
| |
| colors = ['#2E7D32', '#1976D2', '#D32F2F'] |
| |
| for i, result in enumerate(results): |
| model_name = result['model_name'] |
| y_test = result['y_test'] |
| y_pred_proba = result['y_pred_proba'] |
| |
| fpr, tpr, _ = roc_curve(y_test, y_pred_proba) |
| auc = result['auc'] |
| |
| fig.add_trace(go.Scatter( |
| x=fpr, |
| y=tpr, |
| mode='lines', |
| name=f'{model_name} (AUC = {auc:.3f})', |
| line=dict(color=colors[i % len(colors)], width=3) |
| )) |
| |
| fig.add_trace(go.Scatter( |
| x=[0, 1], |
| y=[0, 1], |
| mode='lines', |
| name='Random', |
| line=dict(dash='dash', color='#757575', width=2) |
| )) |
| |
| fig.update_layout( |
| title='ROC Curve Analysis', |
| xaxis_title='False Positive Rate', |
| yaxis_title='True Positive Rate', |
| legend=dict(x=0.01, y=0.99), |
| height=500, |
| template='plotly_white', |
| margin=dict(l=20, r=20, t=40, b=20) |
| ) |
| |
| st.plotly_chart(fig) |
| |
| |
| st.subheader("Confusion Matrix Analysis") |
| |
| cols = st.columns(len(results)) |
| |
| for i, result in enumerate(results): |
| with cols[i]: |
| model_name = result['model_name'] |
| cm = result['confusion_matrix'] |
| |
| |
| cm_percent = cm / cm.sum() |
| |
| |
| annotations = [] |
| for i in range(cm.shape[0]): |
| for j in range(cm.shape[1]): |
| annotations.append({ |
| 'x': j, |
| 'y': i, |
| 'text': f"{cm[i, j]}<br>({cm_percent[i, j]:.1%})", |
| 'showarrow': False, |
| 'font': {'color': 'white' if cm_percent[i, j] > 0.5 else 'black'} |
| }) |
| |
| |
| fig = go.Figure(data=go.Heatmap( |
| z=cm, |
| x=['Predicted Normal', 'Predicted Fraud'], |
| y=['Actual Normal', 'Actual Fraud'], |
| colorscale=[[0, '#81C784'], [1, '#2E7D32']], |
| showscale=False |
| )) |
| |
| fig.update_layout( |
| title=f"{model_name}", |
| annotations=annotations, |
| height=300, |
| template='plotly_white', |
| margin=dict(l=20, r=20, t=40, b=20) |
| ) |
| |
| st.plotly_chart(fig) |
| |
| |
| st.subheader("Feature Importance Analysis") |
| |
| for result in results: |
| model_name = result['model_name'] |
| model = result['model'] |
| |
| if model_name in ['Random Forest', 'XGBoost']: |
| |
| if hasattr(model, 'feature_importances_'): |
| importances = model.feature_importances_ |
| feature_names = X.columns |
| |
| |
| indices = np.argsort(importances)[::-1] |
| top_indices = indices[:10] |
| |
| |
| fig = px.bar( |
| x=importances[top_indices], |
| y=[feature_names[i] for i in top_indices], |
| orientation='h', |
| title=f'Top Features - {model_name}', |
| labels={'x': 'Importance', 'y': 'Feature'}, |
| color=importances[top_indices], |
| color_continuous_scale=['#81C784', '#43A047', '#2E7D32'] |
| ) |
| |
| fig.update_layout( |
| height=400, |
| template='plotly_white', |
| margin=dict(l=20, r=20, t=40, b=20) |
| ) |
| st.plotly_chart(fig) |
| |
| |
| col1, col2 = st.columns([1, 5]) |
| |
| with col1: |
| if st.button("← Back to Feature Extraction", key="back_to_feature_eng"): |
| st.session_state['current_page'] = 'feature_engineering' |
| st.rerun() |
| |
| with col2: |
| if st.session_state['predictions'] is not None: |
| if st.button("Continue to Results →", key="to_results"): |
| st.session_state['current_page'] = 'results' |
| st.rerun() |
|
|
| |
| elif st.session_state['current_page'] == 'results': |
| st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Step 5: Fraud Detection Results</h2></div>", unsafe_allow_html=True) |
| |
| if st.session_state['predictions'] is None: |
| st.error("No predictions found. Please complete model training first.") |
| if st.button("Go back to Model Training"): |
| st.session_state['current_page'] = 'model_training' |
| st.rerun() |
| else: |
| predictions = st.session_state['predictions'] |
| df_with_predictions = predictions['df'] |
| model_name = predictions['model_name'] |
| |
| st.markdown(f"<h3 class='sub-header'>Fraud Detection Results using {model_name}</h3>", unsafe_allow_html=True) |
| |
| |
| fraud_count = df_with_predictions['Predicted_Fraud'].sum() |
| total_count = len(df_with_predictions) |
| fraud_percentage = (fraud_count / total_count) * 100 |
| |
| |
| col1, col2, col3 = st.columns(3) |
|
|
| with col1: |
| st.metric( |
| label="Total Transactions", |
| value=f"{total_count:,}", |
| delta=None |
| ) |
|
|
| with col2: |
| st.metric( |
| label="Detected Frauds", |
| value=f"{fraud_count:,}", |
| delta=None |
| ) |
|
|
| with col3: |
| st.metric( |
| label="Fraud Percentage", |
| value=f"{fraud_percentage:.2f}%", |
| delta=None |
| ) |
| |
| |
| st.subheader("Fraud Probability Distribution") |
| |
| fig = px.histogram( |
| df_with_predictions, |
| x='Fraud_Probability', |
| nbins=50, |
| color='Predicted_Fraud', |
| color_discrete_map={0: "#6200EA", 1: "#D50000"}, |
| labels={'Predicted_Fraud': 'Prediction', '0': 'Normal', '1': 'Fraud'}, |
| title='Distribution of Fraud Probabilities', |
| marginal='box' |
| ) |
| |
| fig.update_layout( |
| height=500, |
| template='plotly_white', |
| margin=dict(l=20, r=20, t=40, b=20) |
| ) |
| st.plotly_chart(fig) |
| |
| |
| st.subheader("High Fraud Probability Transactions") |
| |
| |
| threshold = st.slider( |
| "Fraud Probability Threshold", |
| min_value=0.5, |
| max_value=0.95, |
| value=0.7, |
| step=0.05, |
| help="Transactions with fraud probability above this threshold will be shown" |
| ) |
| |
| high_prob_df = df_with_predictions[df_with_predictions['Fraud_Probability'] > threshold] |
| |
| if len(high_prob_df) > 0: |
| st.write(f"Found {len(high_prob_df)} transactions with fraud probability > {threshold}") |
| |
| |
| high_prob_df = high_prob_df.sort_values('Fraud_Probability', ascending=False) |
| |
| |
| display_cols = ['Fraud_Probability', 'Predicted_Fraud'] |
| |
| |
| if 'Amount' in high_prob_df.columns: |
| display_cols.insert(0, 'Amount') |
| |
| if 'Time' in high_prob_df.columns: |
| display_cols.insert(0, 'Time') |
| |
| |
| if st.session_state['target_col'] in high_prob_df.columns: |
| display_cols.append(st.session_state['target_col']) |
| |
| |
| st.dataframe(high_prob_df[display_cols]) |
| |
| |
| csv = high_prob_df.to_csv(index=False) |
| st.download_button( |
| label="Download High Risk Transactions", |
| data=csv, |
| file_name="high_risk_transactions.csv", |
| mime="text/csv" |
| ) |
| else: |
| st.info(f"No transactions found with fraud probability > {threshold}") |
| |
| st.write("Top 10 highest fraud probability transactions:") |
| st.dataframe(df_with_predictions.sort_values('Fraud_Probability', ascending=False).head(10)) |
| |
| |
| target_col = st.session_state['target_col'] |
| if target_col in df_with_predictions.columns: |
| st.subheader("Actual vs Predicted Fraud") |
| |
| |
| cm = confusion_matrix(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud']) |
| |
| |
| cm_percent = cm / cm.sum() |
| |
| |
| annotations = [] |
| for i in range(cm.shape[0]): |
| for j in range(cm.shape[1]): |
| annotations.append({ |
| 'x': j, |
| 'y': i, |
| 'text': f"{cm[i, j]}<br>({cm_percent[i, j]:.1%})", |
| 'showarrow': False, |
| 'font': {'color': 'white' if cm_percent[i, j] > 0.5 else 'black'} |
| }) |
| |
| |
| fig = go.Figure(data=go.Heatmap( |
| z=cm, |
| x=['Predicted Normal', 'Predicted Fraud'], |
| y=['Actual Normal', 'Actual Fraud'], |
| colorscale=[[0, '#81C784'], [1, '#2E7D32']], |
| showscale=False |
| )) |
| |
| fig.update_layout( |
| title=f"Confusion Matrix - {model_name}", |
| annotations=annotations, |
| height=400, |
| template='plotly_white', |
| margin=dict(l=20, r=20, t=40, b=20) |
| ) |
| |
| st.plotly_chart(fig) |
| |
| |
| accuracy = accuracy_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud']) |
| |
| |
| precision = precision_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud']) |
| recall = recall_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud']) |
| f1 = f1_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud']) |
| |
| |
| st.subheader("Performance Metrics on Full Dataset") |
| |
| col1, col2, col3, col4 = st.columns(4) |
| |
| with col1: |
| st.metric( |
| label="Accuracy", |
| value=f"{accuracy:.4f}", |
| delta=None |
| ) |
| |
| with col2: |
| st.metric( |
| label="Precision", |
| value=f"{precision:.4f}", |
| delta=None |
| ) |
| |
| with col3: |
| st.metric( |
| label="Recall", |
| value=f"{recall:.4f}", |
| delta=None |
| ) |
| |
| with col4: |
| st.metric( |
| label="F1 Score", |
| value=f"{f1:.4f}", |
| delta=None |
| ) |
| |
| |
| st.subheader("Download Results") |
| |
| csv = df_with_predictions.to_csv(index=False) |
| st.download_button( |
| label="Download All Predictions as CSV", |
| data=csv, |
| file_name="fraud_predictions.csv", |
| mime="text/csv" |
| ) |
| |
| |
| col1, col2 = st.columns([1, 5]) |
| |
| with col1: |
| if st.button("← Back to Model Training", key="back_to_model_training"): |
| st.session_state['current_page'] = 'model_training' |
| st.rerun() |
| |
| with col2: |
| if st.button("Start Over", key="start_over"): |
| |
| for key in list(st.session_state.keys()): |
| del st.session_state[key] |
| st.session_state['current_page'] = 'home' |
| st.rerun() |
|
|
|
|