import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import google.generativeai as genai
import os
import warnings
from sklearn.ensemble import RandomForestRegressor
from dotenv import load_dotenv

load_dotenv()

# Import utility functions for data conversion
from utils import safe_convert_numeric, safe_convert_datetime
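# `safe_convert_numeric` / `safe_convert_datetime` are assumed to wrap
# pd.to_numeric / pd.to_datetime with errors='coerce'. A minimal sketch,
# in case utils.py is unavailable:
#     def safe_convert_numeric(s): return pd.to_numeric(s, errors='coerce')
#     def safe_convert_datetime(s): return pd.to_datetime(s, errors='coerce')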

# --- Configuration ---
st.set_page_config(layout="wide", page_title="Payment Analysis Tool")
warnings.filterwarnings("ignore", category=FutureWarning)

# --- AI Helper Setup ---
def configure_gemini():
    """Sets up the AI helper to explain patterns in simple terms"""
    try:
        GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
        if not GEMINI_API_KEY:
            st.error("⚠️ AI helper not available (missing API key).")
            return None
        # Always read the key from the environment; never hard-code credentials
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel('gemini-1.5-flash')
        return model
    except Exception as e:
        st.error(f"AI helper not available: {e}")
        return None

gemini_model = configure_gemini()

# --- Helper Functions ---
def ask_gemini(prompt, attempt_limit=3):
    """Gets easy-to-understand explanations"""
    if not gemini_model:
        return "AI helper not available."
    attempts = 0
    while attempts < attempt_limit:
        try:
            with st.spinner("🧠 Getting insights..."):
                response = gemini_model.generate_content(prompt)
            if response.parts:
                return response.text.strip()
            return "AI helper couldn't provide a response."
        except Exception:
            attempts += 1
    return "AI helper not responding. Please try again later."

def find_column_options(df_columns, keywords):
    """Finds matching columns in your data based on common names"""
    matches = []
    for col in df_columns:
        col_lower = str(col).strip().lower()
        for keyword in keywords:
            # Column names are cleaned to snake_case on load, so normalise
            # keywords the same way before substring matching
            if keyword.replace(' ', '_') in col_lower:
                matches.append(col)
                break
    return matches
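# e.g. find_column_options(['invoice_date', 'amount'], ['invoice date'])
# returns ['invoice_date'] after keyword normalisation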

# --- Data Loading ---
def load_and_clean_data(uploaded_file):
    """Loads your data file and gets it ready for analysis"""
    df = None
    filename = uploaded_file.name
    try:
        # Load based on file type
        if filename.lower().endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        elif filename.lower().endswith(('.xlsx', '.xls')):
            try:
                df = pd.read_excel(uploaded_file, engine='openpyxl')
            except Exception:
                try:
                    uploaded_file.seek(0)
                    df = pd.read_excel(uploaded_file, engine='xlrd')
                except Exception as e_xls:
                    st.error(f"Could not load Excel file: {e_xls}")
                    return None
        else:
            st.error("Please upload a CSV or Excel file.")
            return None
        st.success(f"✅ Data loaded successfully from '{filename}'!")
        # Clean column names (strip whitespace, snake_case, lowercase)
        original_columns = df.columns.tolist()
        df.columns = df.columns.str.strip()
        df.columns = df.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
        df.columns = df.columns.str.lower()
        if original_columns != df.columns.tolist():
            st.toast("Column names cleaned for easier reading")
        # Remove empty columns
        cols_to_drop = [col for col in df.columns if col.startswith('unnamed_') or col == '']
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)
            st.toast(f"Removed {len(cols_to_drop)} empty column(s)")
        return df
    except Exception as e:
        st.error(f"Error loading file: {e}")
        return None

# --- Main App ---
st.title("💰 Payment Analysis Tool")
st.markdown("""
This simple tool helps you understand when your customers pay their invoices and why.
Just upload your payment data and follow the easy steps below.
""")

# --- Initialize Session State ---
if 'data_loaded' not in st.session_state:
    st.session_state.data_loaded = False
if 'df' not in st.session_state:
    st.session_state.df = None
if 'column_mappings' not in st.session_state:
    st.session_state.column_mappings = {}
if 'analysis_step' not in st.session_state:
    st.session_state.analysis_step = 0

# --- Sidebar ---
st.sidebar.header("📋 Simple Steps")

# 1. File Upload
uploaded_file = st.sidebar.file_uploader("Step 1: Upload Your Data", type=['csv', 'xlsx', 'xls'])
if uploaded_file and not st.session_state.data_loaded:
    df_loaded = load_and_clean_data(uploaded_file)
    if df_loaded is not None:
        st.session_state.df = df_loaded
        st.session_state.data_loaded = True
        st.session_state.analysis_step = 1  # Move to column mapping
        st.rerun()  # Refresh page

# --- Main Content Area ---
if not st.session_state.data_loaded:
    st.info("👆 Please upload your payment data file (Excel or CSV) using the sidebar to begin.")
    st.markdown("""
### What to expect:
1. **Upload your data** - We accept Excel or CSV files with your payment information
2. **Identify your columns** - Tell us which columns contain what information
3. **View simple charts** - See easy-to-understand charts showing payment patterns

This tool will help you understand:
* How quickly customers typically pay
* If certain types of customers pay faster than others
* Which factors might influence payment timing
""")
    st.stop()  # Stop here if no data

# Display data preview if loaded
st.subheader("Your Data Preview (First 5 Rows)")
st.dataframe(st.session_state.df.head(), use_container_width=True, height=300)

# 2. Column Mapping (help user identify which columns contain what information)
if st.session_state.analysis_step >= 1:
    st.sidebar.header("Step 2: Identify Your Columns")
    # These define what columns we need and what keywords might identify them
    COLUMN_KEYWORDS = {
        'customer_id': (['customer id', 'client id'], "Customer ID"),
        'amount': (['amount', 'invoice amount', 'value'], "Invoice Amount"),
        'invoice_date': (['invoice date', 'billing date'], "Invoice Date"),
        'due_date': (['due date', 'payment due'], "Payment Due Date"),
        'receipt_date': (['receipt date', 'payment date'], "Date Payment Received"),
        'payment_timing_vs_due': (['receipt date vs due date', 'days vs due', 'payment delay due'], "Days Early/Late vs Due Date"),
        'payment_timing_vs_invoice': (['receipt date vs invoice date', 'days vs invoice', 'payment delay invoice'], "Days Since Invoice Date"),
        'revenue_type': (['revenue type', 'product type', 'service type'], "Product/Service Type"),
        'payment_method': (['payment method', 'payment type', 'payment mode'], "Payment Method"),
    }
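    # Keywords are written with spaces for readability; find_column_options
    # converts them to snake_case to match the cleaned column names.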
    mapping_complete = True
    cols_list = [""] + st.session_state.df.columns.tolist()  # Add empty option
    with st.sidebar.expander("Match your columns", expanded=(st.session_state.analysis_step == 1)):
        st.markdown("**Tell us which columns in your data contain what information**")
        st.markdown("We'll try to guess, but please check if our guesses are correct:")
        for key, (keywords, description) in COLUMN_KEYWORDS.items():
            # Try to find matching columns automatically
            options = find_column_options(st.session_state.df.columns, keywords)
            default_index = 0
            if len(options) == 1:
                try:
                    default_index = cols_list.index(options[0])
                except ValueError:
                    default_index = 0
            elif len(options) > 1:
                st.info(f"Multiple options found for '{description}'. Please select the correct one.")
            # Let user confirm or select the column
            selected_col = st.selectbox(
                f"Which column has: **{description}**?",
                options=cols_list,
                index=default_index,
                key=f"map_{key}"
            )
            if selected_col:
                st.session_state.column_mappings[key] = selected_col
            else:
                st.session_state.column_mappings[key] = None  # Mark as not available
                # The amount column is absolutely necessary
                if key == 'amount':
                    st.warning(f"⚠️ We need '{description}' for analysis!")
                    mapping_complete = False
        # At least one of the two payment timing columns is required
        if not (st.session_state.column_mappings.get('payment_timing_vs_invoice')
                or st.session_state.column_mappings.get('payment_timing_vs_due')):
            st.warning("⚠️ We need at least one payment timing column for analysis!")
            mapping_complete = False
    if st.sidebar.button("Confirm Columns", key="confirm_map_button", disabled=(not mapping_complete)):
        # Prefer timing measured from the invoice date; fall back to timing vs due date
        col_pay_vs_inv = st.session_state.column_mappings.get('payment_timing_vs_invoice')
        col_pay_vs_due = st.session_state.column_mappings.get('payment_timing_vs_due')
        target_col = col_pay_vs_inv if col_pay_vs_inv else col_pay_vs_due
        if not target_col:
            st.sidebar.error("❌ We need at least one payment timing column to proceed.")
        else:
            st.session_state.column_mappings['target_col'] = target_col  # Store the chosen target
            st.session_state.analysis_step = 2  # Move to next step
            st.sidebar.success("✅ Column mapping confirmed!")
            st.rerun()
    elif not mapping_complete and st.session_state.analysis_step == 1:
        st.sidebar.warning("Please select all required columns (marked ⚠️) before continuing.")

# 3. Analysis Steps
if st.session_state.analysis_step >= 2:
    st.sidebar.header("Step 3: View Insights")
    analysis_options = [
        "Payment Timing Overview",
        "Payment Patterns by Category",
        "Customer Groups",
        "Payment Predictions",
        "Summary Report"
    ]
    selected_analysis = st.sidebar.radio("What would you like to see?", analysis_options, key="analysis_radio")

    # --- Get Mapped Columns ---
    colmap = st.session_state.column_mappings
    target_col = colmap.get('target_col')  # The column tracking payment timing

    # --- Execute Selected Analysis ---
    df_processed = st.session_state.df.copy()  # Work with a copy of the data
    # --- Step 3.1: Payment Timing Overview ---
    if selected_analysis == "Payment Timing Overview" or st.session_state.analysis_step > 2:
        if 'cleaned_df' not in st.session_state or selected_analysis == "Payment Timing Overview":
            st.subheader("Preparing Your Data")
            with st.spinner("Getting your data ready..."):
                # Convert columns to the right data types
                numeric_cols_keys = ['amount', 'payment_timing_vs_due', 'payment_timing_vs_invoice']
                date_cols_keys = ['invoice_date', 'due_date', 'receipt_date']
                for key in numeric_cols_keys:
                    col = colmap.get(key)
                    if col and col in df_processed.columns:
                        df_processed[col] = safe_convert_numeric(df_processed[col])
                for key in date_cols_keys:
                    col = colmap.get(key)
                    if col and col in df_processed.columns:
                        df_processed[col] = safe_convert_datetime(df_processed[col])
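                # The coercions above turn unparseable entries into NaN/NaT
                # (assuming the utils helpers use errors='coerce'), so the
                # missing-value handling below also covers bad raw values.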
                # Handle missing values
                st.write("Fixing missing information...")
                # Remove columns with too much missing data
                missing_percentages = df_processed.isnull().sum() / len(df_processed) * 100
                threshold = 50
                high_missing_cols = missing_percentages[missing_percentages > threshold].index
                if not high_missing_cols.empty:
                    df_processed = df_processed.drop(columns=high_missing_cols)
                    st.write("- Removed columns with too much missing data")
                # Fill in other missing values with reasonable estimates
                numerical_cols = df_processed.select_dtypes(include=np.number).columns
                for col in numerical_cols:
                    if df_processed[col].isnull().any():
                        median_val = df_processed[col].median()
                        df_processed[col] = df_processed[col].fillna(median_val)
                st.write("- Filled in missing numbers with typical values")
                # Fill in missing categories
                categorical_cols = df_processed.select_dtypes(exclude=[np.number, 'datetime64[ns]']).columns
                for col in categorical_cols:
                    if df_processed[col].isnull().any():
                        try:
                            mode_val = df_processed[col].mode()[0]
                            df_processed[col] = df_processed[col].fillna(mode_val)
                        except IndexError:
                            df_processed[col] = df_processed[col].fillna('Unknown')
                st.write("- Filled in missing categories with most common values")
                # Store cleaned data for future use
                st.session_state.cleaned_df = df_processed
                st.session_state.analysis_step = max(st.session_state.analysis_step, 3)
            st.success("✅ Data prepared successfully!")
    # Now show the payment timing overview
    if selected_analysis == "Payment Timing Overview":
        st.subheader("📊 Payment Timing Overview")
        cleaned_df = st.session_state.cleaned_df
        payment_col = colmap.get('target_col')
        if payment_col and payment_col in cleaned_df.columns:
            # Create a layout with columns
            col1, col2 = st.columns(2)
            with col1:
                # Create a histogram showing payment timing distribution
                st.write("#### Distribution of Payment Timing")
                fig_hist = px.histogram(
                    cleaned_df,
                    x=payment_col,
                    nbins=20,
                    title="How quickly payments are made",
                    labels={payment_col: "Days to Payment"},
                    color_discrete_sequence=['#3366CC'],
                )
                fig_hist.update_layout(
                    xaxis_title="Days to Payment",
                    yaxis_title="Number of Invoices",
                    showlegend=False
                )
                # Add a vertical line for average
                mean_days = cleaned_df[payment_col].mean()
                fig_hist.add_vline(
                    x=mean_days,
                    line_dash="dash",
                    line_color="red",
                    annotation_text=f"Average: {mean_days:.1f} days",
                    annotation_position="top right"
                )
                # Add a vertical line at 0 days, but only when the target
                # actually measures timing relative to the due date
                if colmap.get('payment_timing_vs_due') == payment_col:
                    fig_hist.add_vline(
                        x=0,
                        line_dash="dash",
                        line_color="green",
                        annotation_text="Due Date",
                        annotation_position="top left"
                    )
                st.plotly_chart(fig_hist, use_container_width=True)
            with col2:
                # Create a box plot showing the spread of payment timings
                st.write("#### Payment Timing Statistics")
                fig_box = px.box(
                    cleaned_df,
                    y=payment_col,
                    title="Range of Payment Timings",
                    points="all",
                    labels={payment_col: "Days to Payment"},
                    color_discrete_sequence=['#3366CC'],
                )
                fig_box.update_layout(
                    yaxis_title="Days to Payment",
                    showlegend=False
                )
                st.plotly_chart(fig_box, use_container_width=True)
            # Payment timing stats
            st.write("#### Key Payment Statistics")
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Average Days", f"{cleaned_df[payment_col].mean():.1f}")
            with col2:
                st.metric("Median Days", f"{cleaned_df[payment_col].median():.1f}")
            early_percent = None
            with col3:
                # On-time share only makes sense when timing is measured vs the due date
                if colmap.get('payment_timing_vs_due') == payment_col:
                    early_percent = (cleaned_df[payment_col] <= 0).mean() * 100
                    st.metric("% Paid On Time", f"{early_percent:.1f}%")
                else:
                    st.metric("Minimum Days", f"{cleaned_df[payment_col].min():.1f}")
            with col4:
                st.metric("Maximum Days", f"{cleaned_df[payment_col].max():.1f}")
            # Get AI explanation if payment_col is days vs due date
            if colmap.get('payment_timing_vs_due') == payment_col:
                # Ask AI for explanation of payment patterns
                prompt = f"""Based on this payment data summary:
- Average days to payment: {cleaned_df[payment_col].mean():.1f}
- Median days to payment: {cleaned_df[payment_col].median():.1f}
- % paid on time: {early_percent:.1f}%
- Maximum days late: {cleaned_df[payment_col].max():.1f}
Please explain in simple terms what this tells us about how customers are paying their invoices.
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
"""
                with st.expander("💡 What does this mean for my business?", expanded=True):
                    explanation = ask_gemini(prompt)
                    st.markdown(explanation)
        else:
            st.error("No payment timing column available. Please check your column mappings.")
    # --- Step 3.2: Payment Patterns by Category ---
    if selected_analysis == "Payment Patterns by Category":
        st.subheader("📊 Payment Patterns by Category")
        cleaned_df = st.session_state.cleaned_df
        payment_col = colmap.get('target_col')
        # Define possible category columns and let user select which to analyze
        category_cols = []
        for key in ['revenue_type', 'payment_method', 'customer_id']:
            col = colmap.get(key)
            if col and col in cleaned_df.columns:
                category_cols.append(col)
        if not category_cols:
            st.warning("No category columns were identified. Please go back to column mapping and identify at least one of: Revenue Type, Payment Method, or Customer ID.")
        else:
            # Let user select which category to analyze
            selected_category = st.selectbox(
                "Select category to analyze:",
                options=category_cols,
                key="category_selector"
            )
            if selected_category and payment_col and payment_col in cleaned_df.columns:
                # Limit categories to the 10 most frequent to avoid cluttered charts
                top_categories = cleaned_df[selected_category].value_counts().nlargest(10).index
                filtered_df = cleaned_df[cleaned_df[selected_category].isin(top_categories)]
                # Create layout with columns
                col1, col2 = st.columns(2)
                with col1:
                    # Box plot showing payment timing by category
                    st.write(f"#### Payment Timing by {selected_category}")
                    fig_category_box = px.box(
                        filtered_df,
                        x=selected_category,
                        y=payment_col,
                        color=selected_category,
                        title=f"How different {selected_category} categories pay",
                        labels={payment_col: "Days to Payment"},
                    )
                    fig_category_box.update_layout(
                        xaxis_title=selected_category,
                        yaxis_title="Days to Payment",
                        xaxis={'categoryorder': 'total descending'}
                    )
                    st.plotly_chart(fig_category_box, use_container_width=True)
                with col2:
                    # Bar chart showing average payment time by category
                    st.write(f"#### Average Payment Time by {selected_category}")
                    category_avg = filtered_df.groupby(selected_category)[payment_col].mean().reset_index()
                    category_avg = category_avg.sort_values(payment_col)
                    fig_category_bar = px.bar(
                        category_avg,
                        x=selected_category,
                        y=payment_col,
                        color=selected_category,
                        title=f"Average days to payment by {selected_category}",
                        labels={payment_col: "Average Days to Payment"},
                    )
                    fig_category_bar.update_layout(
                        xaxis_title=selected_category,
                        yaxis_title="Average Days to Payment",
                        showlegend=False
                    )
                    st.plotly_chart(fig_category_bar, use_container_width=True)
                # Calculate statistics by category
                category_stats = filtered_df.groupby(selected_category).agg({
                    payment_col: ['mean', 'median', 'count'],
                })
                category_stats.columns = [' '.join(col).strip() for col in category_stats.columns.values]
                category_stats = category_stats.reset_index().rename(
                    columns={f"{payment_col} mean": "Avg Days",
                             f"{payment_col} median": "Median Days",
                             f"{payment_col} count": "Count"}
                )
                category_stats["Avg Days"] = category_stats["Avg Days"].round(1)
                category_stats["Median Days"] = category_stats["Median Days"].round(1)
                # Show the statistics table
                st.write(f"#### Statistics by {selected_category}")
                st.dataframe(category_stats, use_container_width=True)
                # Get AI explanation based on category
                fastest_category = category_stats.loc[category_stats["Avg Days"].idxmin(), selected_category]
                slowest_category = category_stats.loc[category_stats["Avg Days"].idxmax(), selected_category]
                avg_diff = category_stats["Avg Days"].max() - category_stats["Avg Days"].min()
                prompt = f"""Based on this payment data analysis by {selected_category}:
- Fastest paying category: {fastest_category} (average {category_stats['Avg Days'].min():.1f} days)
- Slowest paying category: {slowest_category} (average {category_stats['Avg Days'].max():.1f} days)
- Difference between fastest and slowest: {avg_diff:.1f} days
Please explain in simple terms what this tells us about how different {selected_category} categories are paying.
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
"""
                with st.expander("💡 What does this pattern mean?", expanded=True):
                    explanation = ask_gemini(prompt)
                    st.markdown(explanation)
    # --- Step 3.3: Customer Groups ---
    if selected_analysis == "Customer Groups":
        st.subheader("👥 Customer Payment Behavior Groups")
        cleaned_df = st.session_state.cleaned_df
        payment_col = colmap.get('target_col')
        amount_col = colmap.get('amount')
        customer_col = colmap.get('customer_id')
        if not customer_col or customer_col not in cleaned_df.columns:
            st.warning("Please identify a Customer ID column in the column mapping step to see customer groups.")
        elif not payment_col or payment_col not in cleaned_df.columns:
            st.warning("No payment timing column available. Please check your column mappings.")
        else:
            # Create customer-level summary
            st.write("Analyzing customer payment patterns...")
            # Group by customer and calculate statistics
            customer_stats = cleaned_df.groupby(customer_col).agg({
                payment_col: ['mean', 'median', 'min', 'max', 'count'],
            })
            customer_stats.columns = [' '.join(col).strip() for col in customer_stats.columns.values]
            customer_stats = customer_stats.reset_index()
            # Add amount stats if available
            if amount_col and amount_col in cleaned_df.columns:
                amount_stats = cleaned_df.groupby(customer_col).agg({
                    amount_col: ['mean', 'sum'],
                })
                amount_stats.columns = [' '.join(col).strip() for col in amount_stats.columns.values]
                amount_stats = amount_stats.reset_index()
                customer_stats = customer_stats.merge(amount_stats, on=customer_col)
            # Define payment behavior groups based on mean payment time
            # Only apply if we have payment timing vs due date
            if colmap.get('payment_timing_vs_due') == payment_col:
                # Create behavior groups
                mean_col = customer_stats[f"{payment_col} mean"]
                conditions = [
                    (mean_col < -5),                      # Very early (>5 days before due)
                    (mean_col >= -5) & (mean_col < 0),    # Early (0-5 days before due)
                    (mean_col >= 0) & (mean_col < 15),    # On time to slightly late (0-15 days)
                    (mean_col >= 15) & (mean_col < 30),   # Moderately late (15-30 days)
                    (mean_col >= 30)                      # Very late (>30 days)
                ]
                values = ['Very Early Payers', 'Early Payers', 'On-time/Slightly Late', 'Moderately Late', 'Very Late Payers']
                customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
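                # e.g. a customer averaging -7 days falls in 'Very Early Payers';
                # one averaging +20 days falls in 'Moderately Late'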
            else:
                # If we don't have vs due date, create relative groups
                mean_col = customer_stats[f"{payment_col} mean"]
                median_pay_time = mean_col.median()
                conditions = [
                    (mean_col < 0.6 * median_pay_time),                                       # Much faster than median
                    (mean_col >= 0.6 * median_pay_time) & (mean_col < 0.9 * median_pay_time),   # Faster than median
                    (mean_col >= 0.9 * median_pay_time) & (mean_col <= 1.1 * median_pay_time),  # Around median
                    (mean_col > 1.1 * median_pay_time) & (mean_col <= 1.5 * median_pay_time),   # Slower than median
                    (mean_col > 1.5 * median_pay_time)                                        # Much slower than median
                ]
                values = ['Much Faster Payers', 'Faster Payers', 'Average Payers', 'Slower Payers', 'Much Slower Payers']
                customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
            # Count customers in each group
            behavior_counts = customer_stats['Payment Behavior'].value_counts().reset_index()
            behavior_counts.columns = ['Payment Behavior', 'Number of Customers']
            # Create tabs for different views
            tab1, tab2 = st.tabs(["Customer Groups", "Individual Customers"])
            with tab1:
                st.write("#### Customer Payment Behavior Groups")
                # Create a pie chart showing distribution of customer behavior
                fig_pie = px.pie(
                    behavior_counts,
                    values='Number of Customers',
                    names='Payment Behavior',
                    title="Distribution of Customer Payment Behavior"
                )
                st.plotly_chart(fig_pie, use_container_width=True)
                # Show statistics for each behavior group
                behavior_group_stats = customer_stats.groupby('Payment Behavior').agg({
                    f"{payment_col} mean": 'mean',
                    f"{payment_col} count": 'sum',
                    customer_col: 'count'
                }).reset_index()
                behavior_group_stats.columns = ['Payment Behavior', 'Avg Days to Payment', 'Total Invoices', 'Customer Count']
                behavior_group_stats["Avg Days to Payment"] = behavior_group_stats["Avg Days to Payment"].round(1)
                # Add amount statistics if available
                if amount_col and amount_col in cleaned_df.columns and f"{amount_col} sum" in customer_stats.columns:
                    amount_by_behavior = customer_stats.groupby('Payment Behavior')[f"{amount_col} sum"].sum().reset_index()
                    behavior_group_stats = behavior_group_stats.merge(amount_by_behavior, on='Payment Behavior')
                    behavior_group_stats.rename(columns={f"{amount_col} sum": "Total Amount"}, inplace=True)
                st.dataframe(behavior_group_stats, use_container_width=True)
                # Get AI explanation
                largest_group = behavior_counts.loc[behavior_counts['Number of Customers'].idxmax(), 'Payment Behavior']
                prompt = f"""Based on this customer payment behavior analysis:
- Largest customer group: {largest_group} ({behavior_counts['Number of Customers'].max()} customers)
- Total customer segments: {len(behavior_counts)}
Please explain in simple terms what this tells us about our customer base and their payment habits.
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
"""
                with st.expander("💡 What does this mean for my business?", expanded=True):
                    explanation = ask_gemini(prompt)
                    st.markdown(explanation)
            with tab2:
                st.write("#### Individual Customer Payment Behavior")
                st.write("Search for specific customers or sort by payment behavior:")
                # Prepare the customer table
                customer_display = customer_stats.copy()
                customer_display = customer_display.rename(columns={
                    f"{payment_col} mean": "Avg Days to Payment",
                    f"{payment_col} median": "Median Days",
                    f"{payment_col} min": "Min Days",
                    f"{payment_col} max": "Max Days",
                    f"{payment_col} count": "Invoice Count"
                })
                if amount_col and amount_col in cleaned_df.columns:
                    customer_display = customer_display.rename(columns={
                        f"{amount_col} mean": "Avg Amount",
                        f"{amount_col} sum": "Total Amount"
                    })
                # Round numeric columns
                numeric_cols = customer_display.select_dtypes(include=[np.number]).columns
                customer_display[numeric_cols] = customer_display[numeric_cols].round(2)
                # Allow filtering
                selected_behavior = st.multiselect(
                    "Filter by payment behavior:",
                    options=customer_display['Payment Behavior'].unique(),
                    default=None
                )
                if selected_behavior:
                    filtered_customers = customer_display[customer_display['Payment Behavior'].isin(selected_behavior)]
                else:
                    filtered_customers = customer_display
                st.dataframe(filtered_customers, use_container_width=True)
    # --- Step 3.4: Payment Predictions ---
    if selected_analysis == "Payment Predictions":
        st.subheader("🔮 Payment Time Prediction Model")
        cleaned_df = st.session_state.cleaned_df
        payment_col = colmap.get('target_col')
        # Check if we have the necessary data
        if not payment_col or payment_col not in cleaned_df.columns:
            st.warning("No payment timing column available. Please check your column mappings.")
        else:
            st.write("This model helps you predict when customers will pay based on invoice characteristics.")
            # Identify potential predictor variables
            numeric_cols = cleaned_df.select_dtypes(include=np.number).columns.tolist()
            categorical_cols = cleaned_df.select_dtypes(include=['object', 'category']).columns.tolist()
            # Remove the target variable from predictors
            if payment_col in numeric_cols:
                numeric_cols.remove(payment_col)
            # Prepare predictor variables
            potential_predictors = []
            # Add amount if available
            amount_col = colmap.get('amount')
            if amount_col and amount_col in cleaned_df.columns:
                potential_predictors.append(amount_col)
            # Add customer_id if available (skip high-cardinality IDs)
            customer_col = colmap.get('customer_id')
            if customer_col and customer_col in cleaned_df.columns and len(cleaned_df[customer_col].unique()) < 100:
                potential_predictors.append(customer_col)
            # Add revenue_type and payment_method if available
            for key in ['revenue_type', 'payment_method']:
                col = colmap.get(key)
                if col and col in cleaned_df.columns:
                    potential_predictors.append(col)
            # Add other numeric columns that might be useful, excluding both
            # payment-timing columns: they are unknown for a new invoice and
            # would leak the answer into the model
            timing_cols = {colmap.get('payment_timing_vs_due'), colmap.get('payment_timing_vs_invoice')}
            for col in numeric_cols:
                if col not in potential_predictors and col not in timing_cols and 'date' not in col.lower():
                    potential_predictors.append(col)
            if len(potential_predictors) < 1:
                st.warning("Not enough predictor variables available for modeling. Please identify more columns in your data.")
            else:
                # Let user select predictor variables
                st.write("#### Select variables to use for prediction")
                selected_predictors = st.multiselect(
                    "Choose which factors might influence payment timing:",
                    options=potential_predictors,
                    default=potential_predictors[:min(3, len(potential_predictors))]  # Default to first 3
                )
                if len(selected_predictors) < 1:
                    st.warning("Please select at least one predictor variable.")
                else:
                    # Prepare data for modeling
                    X = cleaned_df[selected_predictors].copy()
                    y = cleaned_df[payment_col].copy()
                    # Process categorical variables
                    X_processed = pd.DataFrame()
                    for col in X.columns:
                        if col in categorical_cols:
                            # One-hot encode categorical variables
                            dummies = pd.get_dummies(X[col], prefix=col, drop_first=True)
                            X_processed = pd.concat([X_processed, dummies], axis=1)
                        else:
                            # Keep numeric columns as is
                            X_processed[col] = X[col]
                    # Check if we have any data after processing
                    if X_processed.shape[1] == 0:
                        st.warning("No usable predictor variables after processing. Please select different variables.")
                    else:
                        # Build and train the model
                        with st.spinner("Training prediction model..."):
                            try:
                                # Fill any remaining missing values with median
                                X_processed = X_processed.fillna(X_processed.median())
                                # Train a Random Forest model
                                model = RandomForestRegressor(n_estimators=100, random_state=42)
                                model.fit(X_processed, y)
                                # Get feature importances
                                feature_imp = pd.DataFrame({
                                    'Feature': X_processed.columns,
                                    'Importance': model.feature_importances_
                                }).sort_values('Importance', ascending=False)
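                                # Note: RandomForestRegressor importances are
                                # impurity-based and normalised to sum to 1.0,
                                # so scores are relative, not absolute effects.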
st.success("✅ Payment prediction model trained!") | |
# Show feature importances | |
st.write("#### Factors that influence payment timing") | |
fig_imp = px.bar( | |
feature_imp.head(10), | |
x='Importance', | |
y='Feature', | |
orientation='h', | |
title="Which factors best predict payment timing", | |
labels={'Importance': 'Importance Score'}, | |
) | |
st.plotly_chart(fig_imp, use_container_width=True) | |
# Get AI explanation of feature importance | |
top_features = ", ".join(feature_imp.head(3)['Feature'].tolist()) | |
prompt = f"""Based on this payment prediction model analysis: | |
- Top predictive factors: {top_features} | |
Please explain in simple terms what this tells us about what influences payment timing. | |
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business. | |
""" | |
with st.expander("💡 What influences payment timing?", expanded=True): | |
explanation = ask_gemini(prompt) | |
st.markdown(explanation) | |
# Create prediction interface | |
st.write("#### Predict payment timing for new invoices") | |
st.write("Enter values for a new invoice to predict when it will be paid:") | |
# Create input widgets for each predictor | |
new_invoice_data = {} | |
for col in selected_predictors: | |
if col in categorical_cols: | |
options = cleaned_df[col].unique().tolist() | |
new_invoice_data[col] = st.selectbox(f"Select {col}:", options=options) | |
else: | |
min_val = cleaned_df[col].min() | |
max_val = cleaned_df[col].max() | |
step = (max_val - min_val) / 100 | |
new_invoice_data[col] = st.slider(f"Set {col}:", min_value=float(min_val), max_value=float(max_val), step=float(step)) | |
# Make prediction when button is clicked | |
if st.button("Predict Payment Timing"): | |
# Prepare the input data in the same format as the training data | |
X_new = pd.DataFrame([new_invoice_data]) | |
X_new_processed = pd.DataFrame() | |
# Process the input data the same way as training data | |
for col in X_new.columns: | |
if col in categorical_cols: | |
dummies = pd.get_dummies(X_new[col], prefix=col, drop_first=True) | |
X_new_processed = pd.concat([X_new_processed, dummies], axis=1) | |
else: | |
X_new_processed[col] = X_new[col] | |
# Add missing columns that were in the training data | |
for col in X_processed.columns: | |
if col not in X_new_processed.columns: | |
X_new_processed[col] = 0 | |
# Keep only the columns used during training | |
X_new_processed = X_new_processed[X_processed.columns] | |
# Make prediction | |
prediction = model.predict(X_new_processed)[0] | |
# Display prediction | |
st.success(f"Predicted payment timing: **{prediction:.1f} days**") | |
# Add interpretation if we're predicting vs due date | |
if colmap.get('payment_timing_vs_due') == payment_col: | |
if prediction < 0: | |
st.info(f"This invoice is predicted to be paid **{abs(prediction):.1f} days before** the due date.") | |
elif prediction == 0: | |
st.info("This invoice is predicted to be paid exactly on the due date.") | |
else: | |
st.info(f"This invoice is predicted to be paid **{prediction:.1f} days after** the due date.") | |
except Exception as e: | |
st.error(f"Error building prediction model: {e}") | |
    # --- Step 3.5: Summary Report ---
    if selected_analysis == "Summary Report":
        st.subheader("📝 Payment Analysis Summary Report")
        cleaned_df = st.session_state.cleaned_df
        payment_col = colmap.get('target_col')
        if not payment_col or payment_col not in cleaned_df.columns:
            st.warning("No payment timing column available. Please check your column mappings.")
        else:
            # Overall payment statistics
            st.write("### Overall Payment Performance")
            mean_days = cleaned_df[payment_col].mean()
            median_days = cleaned_df[payment_col].median()
            # Calculate on-time percentage if we have vs due date
            if colmap.get('payment_timing_vs_due') == payment_col:
                ontime_pct = (cleaned_df[payment_col] <= 0).mean() * 100
                late_pct = 100 - ontime_pct
            # Display metrics in columns
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Average Days to Payment", f"{mean_days:.1f}")
            with col2:
                st.metric("Median Days to Payment", f"{median_days:.1f}")
            with col3:
                if colmap.get('payment_timing_vs_due') == payment_col:
                    st.metric("% Paid On Time", f"{ontime_pct:.1f}%")
                else:
                    st.metric("Total Invoices", f"{len(cleaned_df)}")
            # Summary visualization
            st.write("### Payment Timeline")
            # Create a histogram of payment distribution
            fig_summary = px.histogram(
                cleaned_df,
                x=payment_col,
                nbins=30,
                title="Distribution of Payment Timing",
                color_discrete_sequence=['#3366CC']
            )
            # Add markers for key statistics
            fig_summary.add_vline(x=mean_days, line_dash="dash", line_color="red", annotation_text=f"Mean: {mean_days:.1f}")
            fig_summary.add_vline(x=median_days, line_dash="dash", line_color="green", annotation_text=f"Median: {median_days:.1f}")
            if colmap.get('payment_timing_vs_due') == payment_col:
                fig_summary.add_vline(x=0, line_dash="solid", line_color="black", annotation_text="Due Date")
            st.plotly_chart(fig_summary, use_container_width=True)
            # Generate a summary report with key findings
            st.write("### Key Findings")
            # Gather key data points for the AI summary
            summary_data = {
                "mean_days": mean_days,
                "median_days": median_days,
                "min_days": cleaned_df[payment_col].min(),
                "max_days": cleaned_df[payment_col].max(),
                "invoice_count": len(cleaned_df)
            }
            # Add on-time percentages if available
            if colmap.get('payment_timing_vs_due') == payment_col:
                summary_data["ontime_pct"] = ontime_pct
                summary_data["late_pct"] = late_pct
            # Add category information if available
            category_insights = []
            for cat_key in ['revenue_type', 'payment_method']:
                cat_col = colmap.get(cat_key)
                if cat_col and cat_col in cleaned_df.columns:
                    # Calculate best and worst categories
                    cat_avg = cleaned_df.groupby(cat_col)[payment_col].mean()
                    if not cat_avg.empty:
                        best_cat = cat_avg.idxmin()
                        worst_cat = cat_avg.idxmax()
                        best_days = cat_avg.min()
                        worst_days = cat_avg.max()
                        category_insights.append(f"Best {cat_key}: {best_cat} ({best_days:.1f} days)")
                        category_insights.append(f"Worst {cat_key}: {worst_cat} ({worst_days:.1f} days)")
            # Build prompt for AI summary
            prompt = f"""Based on this payment data analysis:
- Average days to payment: {summary_data['mean_days']:.1f}
- Median days to payment: {summary_data['median_days']:.1f}
- Range: {summary_data['min_days']:.1f} to {summary_data['max_days']:.1f} days
- Total invoices analyzed: {summary_data['invoice_count']}
"""
            if 'ontime_pct' in summary_data:
                prompt += f"- On-time payments: {summary_data['ontime_pct']:.1f}%\n"
                prompt += f"- Late payments: {summary_data['late_pct']:.1f}%\n"
            for insight in category_insights:
                prompt += f"- {insight}\n"
            prompt += """\nPlease provide a clear, concise 3-4 sentence summary of what this payment data tells us about the business.
Focus on the most important insights that would be valuable for financial decision-making.
Use simple, non-technical language that anyone in the business could understand.
"""
            with st.expander("💡 Summary of Key Findings", expanded=True):
                final_summary = ask_gemini(prompt)
                st.markdown(final_summary)
            # Recommendations section
            st.write("### Recommendations")
            # Build prompt for AI recommendations
            rec_prompt = prompt + "\n\nBased on this data, please provide 3-4 specific, actionable recommendations to improve payment collection. Each recommendation should be 1-2 sentences and focused on practical actions the business can take."
            recommendations = ask_gemini(rec_prompt)
            st.markdown(recommendations)