billing-insite / app.py
noumanjavaid's picture
Update app.py
50aedaa verified
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import google.generativeai as genai
import os
import warnings
from sklearn.ensemble import RandomForestRegressor
from dotenv import load_dotenv
load_dotenv()
# Import utility functions for data conversion
from utils import safe_convert_numeric, safe_convert_datetime
# --- Configuration ---
st.set_page_config(layout="wide", page_title="Payment Analysis Tool")
warnings.filterwarnings("ignore", category=FutureWarning)
# --- AI Helper Setup ---
@st.cache_resource
def configure_gemini():
"""Sets up the AI helper to explain patterns in simple terms"""
try:
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
if not GEMINI_API_KEY:
st.error("⚠️ AI helper not available (missing API key).")
return None
genai.configure(api_key="AIzaSyBAIYRTzVJZyLEcPpIyyc0Ceb4b04WmVY0")
model = genai.GenerativeModel('gemini-1.5-flash')
return model
except Exception as e:
st.error(f"AI helper not available: {e}")
return None
gemini_model = configure_gemini()
# --- Helper Functions ---
def ask_gemini(prompt, attempt_limit=3):
"""Gets easy-to-understand explanations"""
if not gemini_model:
return "AI helper not available."
attempts = 0
while attempts < attempt_limit:
try:
with st.spinner(f"🧠 Getting insights..."):
response = gemini_model.generate_content(prompt)
if response.parts:
return response.text.strip()
else:
return "AI helper couldn't provide a response."
except Exception:
attempts += 1
if attempts >= attempt_limit:
return f"AI helper not responding. Please try again later."
def find_column_options(df_columns, keywords):
"""Finds matching columns in your data based on common names"""
matches = []
for col in df_columns:
col_lower = str(col).strip().lower()
for keyword in keywords:
if keyword in col_lower:
matches.append(col)
break
return matches
# --- Data Loading ---
@st.cache_data
def load_and_clean_data(uploaded_file):
"""Loads your data file and gets it ready for analysis"""
df = None
filename = uploaded_file.name
try:
# Load based on file type
if filename.lower().endswith('.csv'):
df = pd.read_csv(uploaded_file)
elif filename.lower().endswith(('.xlsx', '.xls')):
try:
df = pd.read_excel(uploaded_file, engine='openpyxl')
except Exception:
try:
uploaded_file.seek(0)
df = pd.read_excel(uploaded_file, engine='xlrd')
except Exception as e_xls:
st.error(f"Could not load Excel file: {e_xls}")
return None
else:
st.error("Please upload a CSV or Excel file.")
return None
st.success(f"✅ Data loaded successfully from '{filename}'!")
# Clean column names (remove spaces, special characters)
original_columns = df.columns.tolist()
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
df.columns = df.columns.str.lower()
if original_columns != df.columns.tolist():
st.toast("Column names cleaned for easier reading")
# Remove empty columns
cols_to_drop = [col for col in df.columns if col.startswith('unnamed_') or col == '']
if cols_to_drop:
df = df.drop(columns=cols_to_drop)
st.toast(f"Removed {len(cols_to_drop)} empty column(s)")
return df
except Exception as e:
st.error(f"Error loading file: {e}")
return None
# --- Main App ---
st.title("💰 Payment Analysis Tool")
st.markdown("""
This simple tool helps you understand when your customers pay their invoices and why.
Just upload your payment data and follow the easy steps below.
""")
# --- Initialize Session State ---
if 'data_loaded' not in st.session_state:
st.session_state.data_loaded = False
if 'df' not in st.session_state:
st.session_state.df = None
if 'column_mappings' not in st.session_state:
st.session_state.column_mappings = {}
if 'analysis_step' not in st.session_state:
st.session_state.analysis_step = 0
# --- Sidebar ---
st.sidebar.header("📋 Simple Steps")
# 1. File Upload
uploaded_file = st.sidebar.file_uploader("Step 1: Upload Your Data", type=['csv', 'xlsx', 'xls'])
if uploaded_file and not st.session_state.data_loaded:
df_loaded = load_and_clean_data(uploaded_file)
if df_loaded is not None:
st.session_state.df = df_loaded
st.session_state.data_loaded = True
st.session_state.analysis_step = 1 # Move to column mapping
st.rerun() # Refresh page
# --- Main Content Area ---
if not st.session_state.data_loaded:
st.info("👆 Please upload your payment data file (Excel or CSV) using the sidebar to begin.")
st.markdown("""
### What to expect:
1. **Upload your data** - We accept Excel or CSV files with your payment information
2. **Identify your columns** - Tell us which columns contain what information
3. **View simple charts** - See easy-to-understand charts showing payment patterns
This tool will help you understand:
* How quickly customers typically pay
* If certain types of customers pay faster than others
* Which factors might influence payment timing
""")
st.stop() # Stop here if no data
# Display data preview if loaded
st.subheader("Your Data Preview (First 5 Rows)")
st.dataframe(st.session_state.df.head(), use_container_width=True, height=300)
# 2. Column Mapping (help user identify which columns contain what information)
if st.session_state.analysis_step >= 1:
st.sidebar.header("Step 2: Identify Your Columns")
# These define what columns we need and what keywords might identify them
COLUMN_KEYWORDS = {
'customer_id': (['customer id', 'client id'], "Customer ID"),
'amount': (['amount', 'invoice amount', 'value'], "Invoice Amount"),
'invoice_date': (['invoice date', 'billing date'], "Invoice Date"),
'due_date': (['due date', 'payment due'], "Payment Due Date"),
'receipt_date': (['receipt date', 'payment date'], "Date Payment Received"),
'payment_timing_vs_due': (['receipt date vs due date', 'days vs due', 'payment delay due'], "Days Early/Late vs Due Date"),
'payment_timing_vs_invoice': (['receipt date vs invoice date', 'days vs invoice', 'payment delay invoice'], "Days Since Invoice Date"),
'revenue_type': (['revenue type', 'product type', 'service type'], "Product/Service Type"),
'payment_method': (['payment method', 'payment type', 'payment mode'], "Payment Method"),
}
mapping_complete = True
cols_list = [""] + st.session_state.df.columns.tolist() # Add empty option
with st.sidebar.expander("Match your columns", expanded=(st.session_state.analysis_step == 1)):
st.markdown("**Tell us which columns in your data contain what information**")
st.markdown("We'll try to guess, but please check if our guesses are correct:")
for key, (keywords, description) in COLUMN_KEYWORDS.items():
# Try to find matching columns automatically
options = find_column_options(st.session_state.df.columns, keywords)
default_index = 0
if len(options) == 1:
try:
default_index = cols_list.index(options[0])
except ValueError:
default_index = 0
elif len(options) > 1:
st.info(f"Multiple options found for '{description}'. Please select the correct one.")
# Let user confirm or select the column
selected_col = st.selectbox(
f"Which column has: **{description}**?",
options=cols_list,
index=default_index,
key=f"map_{key}"
)
if selected_col:
st.session_state.column_mappings[key] = selected_col
else:
# Check if column is absolutely necessary
if key in ['amount', 'payment_timing_vs_invoice', 'payment_timing_vs_due']:
st.warning(f"⚠️ We need '{description}' for analysis!")
mapping_complete = False
else:
st.session_state.column_mappings[key] = None # Mark as not available
if st.sidebar.button("Confirm Columns", key="confirm_map_button", disabled=(not mapping_complete)):
# Select which payment timing column to use
col_pay_vs_inv = st.session_state.column_mappings.get('payment_timing_vs_invoice')
col_pay_vs_due = st.session_state.column_mappings.get('payment_timing_vs_due')
target_col = col_pay_vs_inv if col_pay_vs_inv else col_pay_vs_due
if not target_col:
st.sidebar.error("❌ We need at least one payment timing column to proceed.")
else:
st.session_state.column_mappings['target_col'] = target_col # Store the chosen target
st.session_state.analysis_step = 2 # Move to next step
st.sidebar.success("✅ Column mapping confirmed!")
st.rerun()
elif not mapping_complete and st.session_state.analysis_step == 1:
st.sidebar.warning("Please select all required columns (marked ⚠️) before continuing.")
# 3. Analysis Steps
if st.session_state.analysis_step >= 2:
st.sidebar.header("Step 3: View Insights")
analysis_options = [
"Payment Timing Overview",
"Payment Patterns by Category",
"Customer Groups",
"Payment Predictions",
"Summary Report"
]
selected_analysis = st.sidebar.radio("What would you like to see?", analysis_options, key="analysis_radio")
# --- Get Mapped Columns ---
colmap = st.session_state.column_mappings
target_col = colmap.get('target_col') # The column tracking payment timing
# --- Execute Selected Analysis ---
df_processed = st.session_state.df.copy() # Work with a copy of the data
# --- Step 3.1: Payment Timing Overview ---
if selected_analysis == "Payment Timing Overview" or st.session_state.analysis_step > 2:
if 'cleaned_df' not in st.session_state or selected_analysis == "Payment Timing Overview":
st.subheader("Preparing Your Data")
with st.spinner("Getting your data ready..."):
# Convert columns to the right data types
numeric_cols_keys = ['amount', 'payment_timing_vs_due', 'payment_timing_vs_invoice']
date_cols_keys = ['invoice_date', 'due_date', 'receipt_date']
for key in numeric_cols_keys:
col = colmap.get(key)
if col and col in df_processed.columns:
df_processed[col] = safe_convert_numeric(df_processed[col])
for key in date_cols_keys:
col = colmap.get(key)
if col and col in df_processed.columns:
df_processed[col] = safe_convert_datetime(df_processed[col])
# Handle missing values
st.write("Fixing missing information...")
# Remove columns with too much missing data
missing_percentages = df_processed.isnull().sum() / len(df_processed) * 100
threshold = 50
high_missing_cols = missing_percentages[missing_percentages > threshold].index
if not high_missing_cols.empty:
df_processed = df_processed.drop(columns=high_missing_cols)
st.write(f"- Removed columns with too much missing data")
# Fill in other missing values with reasonable estimates
numerical_cols = df_processed.select_dtypes(include=np.number).columns
for col in numerical_cols:
if df_processed[col].isnull().any():
median_val = df_processed[col].median()
df_processed[col] = df_processed[col].fillna(median_val)
st.write("- Filled in missing numbers with typical values")
# Fill in missing categories
categorical_cols = df_processed.select_dtypes(exclude=[np.number, 'datetime64[ns]']).columns
for col in categorical_cols:
if df_processed[col].isnull().any():
try:
mode_val = df_processed[col].mode()[0]
df_processed[col] = df_processed[col].fillna(mode_val)
except IndexError:
df_processed[col] = df_processed[col].fillna('Unknown')
st.write("- Filled in missing categories with most common values")
# Store cleaned data for future use
st.session_state.cleaned_df = df_processed
st.session_state.analysis_step = max(st.session_state.analysis_step, 3)
st.success("✅ Data prepared successfully!")
# Now show the payment timing overview
if selected_analysis == "Payment Timing Overview":
st.subheader("📊 Payment Timing Overview")
cleaned_df = st.session_state.cleaned_df
payment_col = colmap.get('target_col')
if payment_col and payment_col in cleaned_df.columns:
# Create a layout with columns
col1, col2 = st.columns(2)
with col1:
# Create a histogram showing payment timing distribution
st.write("#### Distribution of Payment Timing")
fig_hist = px.histogram(
cleaned_df,
x=payment_col,
nbins=20,
title=f"How quickly payments are made",
labels={payment_col: "Days to Payment"},
color_discrete_sequence=['#3366CC'],
)
fig_hist.update_layout(
xaxis_title="Days to Payment",
yaxis_title="Number of Invoices",
showlegend=False
)
# Add a vertical line for average
mean_days = cleaned_df[payment_col].mean()
fig_hist.add_vline(
x=mean_days,
line_dash="dash",
line_color="red",
annotation_text=f"Average: {mean_days:.1f} days",
annotation_position="top right"
)
# Add a vertical line for on-time (0 days)
if colmap.get('payment_timing_vs_due'):
fig_hist.add_vline(
x=0,
line_dash="dash",
line_color="green",
annotation_text="Due Date",
annotation_position="top left"
)
st.plotly_chart(fig_hist, use_container_width=True)
with col2:
# Create a box plot showing the spread of payment timings
st.write("#### Payment Timing Statistics")
fig_box = px.box(
cleaned_df,
y=payment_col,
title="Range of Payment Timings",
points="all",
labels={payment_col: "Days to Payment"},
color_discrete_sequence=['#3366CC'],
)
fig_box.update_layout(
yaxis_title="Days to Payment",
showlegend=False
)
st.plotly_chart(fig_box, use_container_width=True)
# Payment timing stats
st.write("#### Key Payment Statistics")
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Average Days", f"{cleaned_df[payment_col].mean():.1f}")
with col2:
st.metric("Median Days", f"{cleaned_df[payment_col].median():.1f}")
with col3:
early_percent = (cleaned_df[payment_col] <= 0).mean() * 100 if 'payment_timing_vs_due' in colmap.values() else None
if early_percent is not None:
st.metric("% Paid On Time", f"{early_percent:.1f}%")
else:
st.metric("Minimum Days", f"{cleaned_df[payment_col].min():.1f}")
with col4:
st.metric("Maximum Days", f"{cleaned_df[payment_col].max():.1f}")
# Get AI explanation if payment_col is days vs due date
if colmap.get('payment_timing_vs_due') == payment_col:
# Ask AI for explanation of payment patterns
prompt = f"""Based on this payment data summary:
- Average days to payment: {cleaned_df[payment_col].mean():.1f}
- Median days to payment: {cleaned_df[payment_col].median():.1f}
- % paid on time: {early_percent:.1f}%
- Maximum days late: {cleaned_df[payment_col].max():.1f}
Please explain in simple terms what this tells us about how customers are paying their invoices.
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
"""
with st.expander("💡 What does this mean for my business?", expanded=True):
explanation = ask_gemini(prompt)
st.markdown(explanation)
else:
st.error("No payment timing column available. Please check your column mappings.")
# --- Step 3.2: Payment Patterns by Category ---
if selected_analysis == "Payment Patterns by Category":
st.subheader("📊 Payment Patterns by Category")
cleaned_df = st.session_state.cleaned_df
payment_col = colmap.get('target_col')
# Define possible category columns and let user select which to analyze
category_cols = []
for key in ['revenue_type', 'payment_method', 'customer_id']:
col = colmap.get(key)
if col and col in cleaned_df.columns:
category_cols.append(col)
if not category_cols:
st.warning("No category columns were identified. Please go back to column mapping and identify at least one of: Revenue Type, Payment Method, or Customer ID.")
else:
# Let user select which category to analyze
selected_category = st.selectbox(
"Select category to analyze:",
options=category_cols,
key="category_selector"
)
if selected_category and payment_col and payment_col in cleaned_df.columns:
# Limit categories to top 10 by frequency to avoid cluttered charts
top_categories = cleaned_df[selected_category].value_counts().nlargest(10).index
filtered_df = cleaned_df[cleaned_df[selected_category].isin(top_categories)]
# Create layout with columns
col1, col2 = st.columns(2)
with col1:
# Box plot showing payment timing by category
st.write(f"#### Payment Timing by {selected_category}")
fig_category_box = px.box(
filtered_df,
x=selected_category,
y=payment_col,
color=selected_category,
title=f"How different {selected_category} categories pay",
labels={payment_col: "Days to Payment"},
)
fig_category_box.update_layout(
xaxis_title=selected_category,
yaxis_title="Days to Payment",
xaxis={'categoryorder':'total descending'}
)
st.plotly_chart(fig_category_box, use_container_width=True)
with col2:
# Bar chart showing average payment time by category
st.write(f"#### Average Payment Time by {selected_category}")
category_avg = filtered_df.groupby(selected_category)[payment_col].mean().reset_index()
category_avg = category_avg.sort_values(payment_col)
fig_category_bar = px.bar(
category_avg,
x=selected_category,
y=payment_col,
color=selected_category,
title=f"Average days to payment by {selected_category}",
labels={payment_col: "Average Days to Payment"},
)
fig_category_bar.update_layout(
xaxis_title=selected_category,
yaxis_title="Average Days to Payment",
showlegend=False
)
st.plotly_chart(fig_category_bar, use_container_width=True)
# Calculate statistics by category
category_stats = filtered_df.groupby(selected_category).agg({
payment_col: ['mean', 'median', 'count'],
})
category_stats.columns = [' '.join(col).strip() for col in category_stats.columns.values]
category_stats = category_stats.reset_index().rename(
columns={f"{payment_col} mean": "Avg Days",
f"{payment_col} median": "Median Days",
f"{payment_col} count": "Count"}
)
category_stats["Avg Days"] = category_stats["Avg Days"].round(1)
category_stats["Median Days"] = category_stats["Median Days"].round(1)
# Show the statistics table
st.write(f"#### Statistics by {selected_category}")
st.dataframe(category_stats, use_container_width=True)
# Get AI explanation based on category
fastest_category = category_stats.loc[category_stats["Avg Days"].idxmin()][selected_category]
slowest_category = category_stats.loc[category_stats["Avg Days"].idxmax()][selected_category]
avg_diff = category_stats["Avg Days"].max() - category_stats["Avg Days"].min()
prompt = f"""Based on this payment data analysis by {selected_category}:
- Fastest paying category: {fastest_category} (average {category_stats['Avg Days'].min():.1f} days)
- Slowest paying category: {slowest_category} (average {category_stats['Avg Days'].max():.1f} days)
- Difference between fastest and slowest: {avg_diff:.1f} days
Please explain in simple terms what this tells us about how different {selected_category} categories are paying.
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
"""
with st.expander("💡 What does this pattern mean?", expanded=True):
explanation = ask_gemini(prompt)
st.markdown(explanation)
# --- Step 3.3: Customer Groups ---
if selected_analysis == "Customer Groups":
st.subheader("👥 Customer Payment Behavior Groups")
cleaned_df = st.session_state.cleaned_df
payment_col = colmap.get('target_col')
amount_col = colmap.get('amount')
customer_col = colmap.get('customer_id')
if not customer_col or not customer_col in cleaned_df.columns:
st.warning("Please identify a Customer ID column in the column mapping step to see customer groups.")
elif not payment_col or not payment_col in cleaned_df.columns:
st.warning("No payment timing column available. Please check your column mappings.")
else:
# Create customer-level summary
st.write("Analyzing customer payment patterns...")
# Group by customer and calculate statistics
customer_stats = cleaned_df.groupby(customer_col).agg({
payment_col: ['mean', 'median', 'min', 'max', 'count'],
})
customer_stats.columns = [' '.join(col).strip() for col in customer_stats.columns.values]
customer_stats = customer_stats.reset_index()
# Add amount stats if available
if amount_col and amount_col in cleaned_df.columns:
amount_stats = cleaned_df.groupby(customer_col).agg({
amount_col: ['mean', 'sum'],
})
amount_stats.columns = [' '.join(col).strip() for col in amount_stats.columns.values]
amount_stats = amount_stats.reset_index()
customer_stats = customer_stats.merge(amount_stats, on=customer_col)
# Define payment behavior groups based on mean payment time
# Only apply if we have payment timing vs due date
if colmap.get('payment_timing_vs_due') == payment_col:
# Create behavior groups
conditions = [
(customer_stats[f"{payment_col} mean"] < -5), # Very early (>5 days before due)
(customer_stats[f"{payment_col} mean"] >= -5) & (customer_stats[f"{payment_col} mean"] < 0), # Early (0-5 days before due)
(customer_stats[f"{payment_col} mean"] >= 0) & (customer_stats[f"{payment_col} mean"] < 15), # On time to slightly late (0-15 days)
(customer_stats[f"{payment_col} mean"] >= 15) & (customer_stats[f"{payment_col} mean"] < 30), # Moderately late (15-30 days)
(customer_stats[f"{payment_col} mean"] >= 30) # Very late (>30 days)
]
values = ['Very Early Payers', 'Early Payers', 'On-time/Slightly Late', 'Moderately Late', 'Very Late Payers']
customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
else:
# If we don't have vs due date, create relative groups
median_pay_time = customer_stats[f"{payment_col} mean"].median()
conditions = [
(customer_stats[f"{payment_col} mean"] < 0.6 * median_pay_time), # Much faster than median
(customer_stats[f"{payment_col} mean"] >= 0.6 * median_pay_time) & (customer_stats[f"{payment_col} mean"] < 0.9 * median_pay_time), # Faster than median
(customer_stats[f"{payment_col} mean"] >= 0.9 * median_pay_time) & (customer_stats[f"{payment_col} mean"] <= 1.1 * median_pay_time), # Around median
(customer_stats[f"{payment_col} mean"] > 1.1 * median_pay_time) & (customer_stats[f"{payment_col} mean"] <= 1.5 * median_pay_time), # Slower than median
(customer_stats[f"{payment_col} mean"] > 1.5 * median_pay_time) # Much slower than median
]
values = ['Much Faster Payers', 'Faster Payers', 'Average Payers', 'Slower Payers', 'Much Slower Payers']
customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
# Count customers in each group
behavior_counts = customer_stats['Payment Behavior'].value_counts().reset_index()
behavior_counts.columns = ['Payment Behavior', 'Number of Customers']
# Create tabs for different views
tab1, tab2 = st.tabs(["Customer Groups", "Individual Customers"])
with tab1:
st.write("#### Customer Payment Behavior Groups")
# Create a pie chart showing distribution of customer behavior
fig_pie = px.pie(
behavior_counts,
values='Number of Customers',
names='Payment Behavior',
title="Distribution of Customer Payment Behavior"
)
st.plotly_chart(fig_pie, use_container_width=True)
# Show statistics for each behavior group
behavior_group_stats = customer_stats.groupby('Payment Behavior').agg({
f"{payment_col} mean": 'mean',
f"{payment_col} count": 'sum',
customer_col: 'count'
}).reset_index()
behavior_group_stats.columns = ['Payment Behavior', 'Avg Days to Payment', 'Total Invoices', 'Customer Count']
behavior_group_stats["Avg Days to Payment"] = behavior_group_stats["Avg Days to Payment"].round(1)
# Add amount statistics if available
if amount_col and amount_col in cleaned_df.columns and f"{amount_col} sum" in customer_stats.columns:
amount_by_behavior = customer_stats.groupby('Payment Behavior')[f"{amount_col} sum"].sum().reset_index()
behavior_group_stats = behavior_group_stats.merge(amount_by_behavior, on='Payment Behavior')
behavior_group_stats.rename(columns={f"{amount_col} sum": "Total Amount"}, inplace=True)
st.dataframe(behavior_group_stats, use_container_width=True)
# Get AI explanation
largest_group = behavior_counts.loc[behavior_counts['Number of Customers'].idxmax()]['Payment Behavior']
prompt = f"""Based on this customer payment behavior analysis:
- Largest customer group: {largest_group} ({behavior_counts['Number of Customers'].max()} customers)
- Total customer segments: {len(behavior_counts)}
Please explain in simple terms what this tells us about our customer base and their payment habits.
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
"""
with st.expander("💡 What does this mean for my business?", expanded=True):
explanation = ask_gemini(prompt)
st.markdown(explanation)
with tab2:
st.write("#### Individual Customer Payment Behavior")
st.write("Search for specific customers or sort by payment behavior:")
# Prepare the customer table
customer_display = customer_stats.copy()
customer_display = customer_display.rename(columns={
f"{payment_col} mean": "Avg Days to Payment",
f"{payment_col} median": "Median Days",
f"{payment_col} min": "Min Days",
f"{payment_col} max": "Max Days",
f"{payment_col} count": "Invoice Count"
})
if amount_col and amount_col in cleaned_df.columns:
customer_display = customer_display.rename(columns={
f"{amount_col} mean": "Avg Amount",
f"{amount_col} sum": "Total Amount"
})
# Round numeric columns
numeric_cols = customer_display.select_dtypes(include=[np.number]).columns
customer_display[numeric_cols] = customer_display[numeric_cols].round(2)
# Allow filtering
selected_behavior = st.multiselect(
"Filter by payment behavior:",
options=customer_display['Payment Behavior'].unique(),
default=None
)
if selected_behavior:
filtered_customers = customer_display[customer_display['Payment Behavior'].isin(selected_behavior)]
else:
filtered_customers = customer_display
st.dataframe(filtered_customers, use_container_width=True)
# --- Step 3.4: Payment Predictions ---
if selected_analysis == "Payment Predictions":
st.subheader("🔮 Payment Time Prediction Model")
cleaned_df = st.session_state.cleaned_df
payment_col = colmap.get('target_col')
# Check if we have the necessary data
if not payment_col or not payment_col in cleaned_df.columns:
st.warning("No payment timing column available. Please check your column mappings.")
else:
st.write("This model helps you predict when customers will pay based on invoice characteristics.")
# Identify potential predictor variables
numeric_cols = cleaned_df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = cleaned_df.select_dtypes(include=['object', 'category']).columns.tolist()
# Remove the target variable from predictors
if payment_col in numeric_cols:
numeric_cols.remove(payment_col)
# Prepare predictor variables
potential_predictors = []
# Add amount if available
amount_col = colmap.get('amount')
if amount_col and amount_col in cleaned_df.columns:
potential_predictors.append(amount_col)
# Add customer_id if available
customer_col = colmap.get('customer_id')
if customer_col and customer_col in cleaned_df.columns and len(cleaned_df[customer_col].unique()) < 100:
potential_predictors.append(customer_col)
# Add revenue_type and payment_method if available
for key in ['revenue_type', 'payment_method']:
col = colmap.get(key)
if col and col in cleaned_df.columns:
potential_predictors.append(col)
# Add other numeric columns that might be useful
for col in numeric_cols:
if col not in potential_predictors and col != payment_col and 'date' not in col.lower():
potential_predictors.append(col)
if len(potential_predictors) < 1:
st.warning("Not enough predictor variables available for modeling. Please identify more columns in your data.")
else:
# Let user select predictor variables
st.write("#### Select variables to use for prediction")
selected_predictors = st.multiselect(
"Choose which factors might influence payment timing:",
options=potential_predictors,
default=potential_predictors[:min(3, len(potential_predictors))] # Default to first 3
)
if len(selected_predictors) < 1:
st.warning("Please select at least one predictor variable.")
else:
# Prepare data for modeling
X = cleaned_df[selected_predictors].copy()
y = cleaned_df[payment_col].copy()
# Process categorical variables
X_processed = pd.DataFrame()
for col in X.columns:
if col in categorical_cols:
# One-hot encode categorical variables
dummies = pd.get_dummies(X[col], prefix=col, drop_first=True)
X_processed = pd.concat([X_processed, dummies], axis=1)
else:
# Keep numeric columns as is
X_processed[col] = X[col]
# Check if we have any data after processing
if X_processed.shape[1] == 0:
st.warning("No usable predictor variables after processing. Please select different variables.")
else:
# Build and train the model
with st.spinner("Training prediction model..."):
try:
# Fill any remaining missing values with median
X_processed = X_processed.fillna(X_processed.median())
# Train a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_processed, y)
# Get feature importances
feature_imp = pd.DataFrame({
'Feature': X_processed.columns,
'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)
st.success("✅ Payment prediction model trained!")
# Show feature importances
st.write("#### Factors that influence payment timing")
fig_imp = px.bar(
feature_imp.head(10),
x='Importance',
y='Feature',
orientation='h',
title="Which factors best predict payment timing",
labels={'Importance': 'Importance Score'},
)
st.plotly_chart(fig_imp, use_container_width=True)
# Get AI explanation of feature importance
top_features = ", ".join(feature_imp.head(3)['Feature'].tolist())
prompt = f"""Based on this payment prediction model analysis:
- Top predictive factors: {top_features}
Please explain in simple terms what this tells us about what influences payment timing.
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
"""
with st.expander("💡 What influences payment timing?", expanded=True):
explanation = ask_gemini(prompt)
st.markdown(explanation)
# Create prediction interface
st.write("#### Predict payment timing for new invoices")
st.write("Enter values for a new invoice to predict when it will be paid:")
# Create input widgets for each predictor
new_invoice_data = {}
for col in selected_predictors:
if col in categorical_cols:
options = cleaned_df[col].unique().tolist()
new_invoice_data[col] = st.selectbox(f"Select {col}:", options=options)
else:
min_val = cleaned_df[col].min()
max_val = cleaned_df[col].max()
step = (max_val - min_val) / 100
new_invoice_data[col] = st.slider(f"Set {col}:", min_value=float(min_val), max_value=float(max_val), step=float(step))
# Make prediction when button is clicked
if st.button("Predict Payment Timing"):
# Prepare the input data in the same format as the training data
X_new = pd.DataFrame([new_invoice_data])
X_new_processed = pd.DataFrame()
# Process the input data the same way as training data
for col in X_new.columns:
if col in categorical_cols:
dummies = pd.get_dummies(X_new[col], prefix=col, drop_first=True)
X_new_processed = pd.concat([X_new_processed, dummies], axis=1)
else:
X_new_processed[col] = X_new[col]
# Add missing columns that were in the training data
for col in X_processed.columns:
if col not in X_new_processed.columns:
X_new_processed[col] = 0
# Keep only the columns used during training
X_new_processed = X_new_processed[X_processed.columns]
# Make prediction
prediction = model.predict(X_new_processed)[0]
# Display prediction
st.success(f"Predicted payment timing: **{prediction:.1f} days**")
# Add interpretation if we're predicting vs due date
if colmap.get('payment_timing_vs_due') == payment_col:
if prediction < 0:
st.info(f"This invoice is predicted to be paid **{abs(prediction):.1f} days before** the due date.")
elif prediction == 0:
st.info("This invoice is predicted to be paid exactly on the due date.")
else:
st.info(f"This invoice is predicted to be paid **{prediction:.1f} days after** the due date.")
except Exception as e:
st.error(f"Error building prediction model: {e}")
# --- Step 3.5: Summary Report ---
if selected_analysis == "Summary Report":
st.subheader("📝 Payment Analysis Summary Report")
cleaned_df = st.session_state.cleaned_df
payment_col = colmap.get('target_col')
if not payment_col or not payment_col in cleaned_df.columns:
st.warning("No payment timing column available. Please check your column mappings.")
else:
# Overall payment statistics
st.write("### Overall Payment Performance")
mean_days = cleaned_df[payment_col].mean()
median_days = cleaned_df[payment_col].median()
# Calculate on-time percentage if we have vs due date
if colmap.get('payment_timing_vs_due') == payment_col:
ontime_pct = (cleaned_df[payment_col] <= 0).mean() * 100
late_pct = 100 - ontime_pct
# Display metrics in columns
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Average Days to Payment", f"{mean_days:.1f}")
with col2:
st.metric("Median Days to Payment", f"{median_days:.1f}")
with col3:
if colmap.get('payment_timing_vs_due') == payment_col:
st.metric("% Paid On Time", f"{ontime_pct:.1f}%")
else:
st.metric("Total Invoices", f"{len(cleaned_df)}")
# Summary visualization
st.write("### Payment Timeline")
# Create a histogram of payment distribution
fig_summary = px.histogram(
cleaned_df,
x=payment_col,
nbins=30,
title="Distribution of Payment Timing",
color_discrete_sequence=['#3366CC']
)
# Add markers for key statistics
fig_summary.add_vline(x=mean_days, line_dash="dash", line_color="red", annotation_text=f"Mean: {mean_days:.1f}")
fig_summary.add_vline(x=median_days, line_dash="dash", line_color="green", annotation_text=f"Median: {median_days:.1f}")
if colmap.get('payment_timing_vs_due') == payment_col:
fig_summary.add_vline(x=0, line_dash="solid", line_color="black", annotation_text="Due Date")
st.plotly_chart(fig_summary, use_container_width=True)
# Generate a summary report with key findings
st.write("### Key Findings")
# Gather key data points for the AI summary
summary_data = {
"mean_days": mean_days,
"median_days": median_days,
"min_days": cleaned_df[payment_col].min(),
"max_days": cleaned_df[payment_col].max(),
"invoice_count": len(cleaned_df)
}
# Add on-time percentages if available
if colmap.get('payment_timing_vs_due') == payment_col:
summary_data["ontime_pct"] = ontime_pct
summary_data["late_pct"] = late_pct
# Add category information if available
category_insights = []
for cat_key in ['revenue_type', 'payment_method']:
cat_col = colmap.get(cat_key)
if cat_col and cat_col in cleaned_df.columns:
# Calculate best and worst categories
cat_avg = cleaned_df.groupby(cat_col)[payment_col].mean()
if not cat_avg.empty:
best_cat = cat_avg.idxmin()
worst_cat = cat_avg.idxmax()
best_days = cat_avg.min()
worst_days = cat_avg.max()
category_insights.append(f"Best {cat_key}: {best_cat} ({best_days:.1f} days)")
category_insights.append(f"Worst {cat_key}: {worst_cat} ({worst_days:.1f} days)")
# Build prompt for AI summary
prompt = f"""Based on this payment data analysis:
- Average days to payment: {summary_data['mean_days']:.1f}
- Median days to payment: {summary_data['median_days']:.1f}
- Range: {summary_data['min_days']:.1f} to {summary_data['max_days']:.1f} days
- Total invoices analyzed: {summary_data['invoice_count']}
"""
if 'ontime_pct' in summary_data:
prompt += f"- On-time payments: {summary_data['ontime_pct']:.1f}%\n"
prompt += f"- Late payments: {summary_data['late_pct']:.1f}%\n"
for insight in category_insights:
prompt += f"- {insight}\n"
prompt += """\nPlease provide a clear, concise 3-4 sentence summary of what this payment data tells us about the business.
Focus on the most important insights that would be valuable for financial decision-making.
Use simple, non-technical language that anyone in the business could understand.
"""
with st.expander("💡 Summary of Key Findings", expanded=True):
final_summary = ask_gemini(prompt)
st.markdown(final_summary)
# Recommendations section
st.write("### Recommendations")
# Build prompt for AI recommendations
rec_prompt = prompt + "\n\nBased on this data, please provide 3-4 specific, actionable recommendations to improve payment collection. Each recommendation should be 1-2 sentences and focused on practical actions the business can take."
recommendations = ask_gemini(rec_prompt)
st.markdown(recommendations)