import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import google.generativeai as genai
import os
import warnings
from sklearn.ensemble import RandomForestRegressor
from dotenv import load_dotenv

load_dotenv()

# Import utility functions for data conversion
from utils import safe_convert_numeric, safe_convert_datetime
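# `safe_convert_numeric` / `safe_convert_datetime` are assumed to wrap
# pd.to_numeric / pd.to_datetime with errors='coerce'. A minimal sketch,
# in case utils.py is unavailable:
#     def safe_convert_numeric(s): return pd.to_numeric(s, errors='coerce')
#     def safe_convert_datetime(s): return pd.to_datetime(s, errors='coerce')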

# --- Configuration ---
st.set_page_config(layout="wide", page_title="Payment Analysis Tool")
warnings.filterwarnings("ignore", category=FutureWarning)

# --- AI Helper Setup ---
def configure_gemini():
    """Sets up the AI helper to explain patterns in simple terms"""
    try:
        GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
        if not GEMINI_API_KEY:
            st.error("⚠️ AI helper not available (missing API key).")
            return None
        # Always read the key from the environment; never hard-code credentials
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel('gemini-1.5-flash')
        return model
    except Exception as e:
        st.error(f"AI helper not available: {e}")
        return None

gemini_model = configure_gemini()

# --- Helper Functions ---
def ask_gemini(prompt, attempt_limit=3):
    """Gets easy-to-understand explanations"""
    if not gemini_model:
        return "AI helper not available."
    attempts = 0
    while attempts < attempt_limit:
        try:
            with st.spinner("🧠 Getting insights..."):
                response = gemini_model.generate_content(prompt)
            if response.parts:
                return response.text.strip()
            return "AI helper couldn't provide a response."
        except Exception:
            attempts += 1
    return "AI helper not responding. Please try again later."

def find_column_options(df_columns, keywords):
    """Finds matching columns in your data based on common names"""
    matches = []
    for col in df_columns:
        col_lower = str(col).strip().lower()
        for keyword in keywords:
            # Column names are cleaned to snake_case on load, so normalise
            # keywords the same way before substring matching
            if keyword.replace(' ', '_') in col_lower:
                matches.append(col)
                break
    return matches
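# e.g. find_column_options(['invoice_date', 'amount'], ['invoice date'])
# returns ['invoice_date'] after keyword normalisation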

# --- Data Loading ---
def load_and_clean_data(uploaded_file):
    """Loads your data file and gets it ready for analysis"""
    df = None
    filename = uploaded_file.name
    try:
        # Load based on file type
        if filename.lower().endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        elif filename.lower().endswith(('.xlsx', '.xls')):
            try:
                df = pd.read_excel(uploaded_file, engine='openpyxl')
            except Exception:
                try:
                    uploaded_file.seek(0)
                    df = pd.read_excel(uploaded_file, engine='xlrd')
                except Exception as e_xls:
                    st.error(f"Could not load Excel file: {e_xls}")
                    return None
        else:
            st.error("Please upload a CSV or Excel file.")
            return None
        st.success(f"✅ Data loaded successfully from '{filename}'!")
        # Clean column names (strip whitespace, snake_case, lowercase)
        original_columns = df.columns.tolist()
        df.columns = df.columns.str.strip()
        df.columns = df.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
        df.columns = df.columns.str.lower()
        if original_columns != df.columns.tolist():
            st.toast("Column names cleaned for easier reading")
        # Remove empty columns
        cols_to_drop = [col for col in df.columns if col.startswith('unnamed_') or col == '']
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)
            st.toast(f"Removed {len(cols_to_drop)} empty column(s)")
        return df
    except Exception as e:
        st.error(f"Error loading file: {e}")
        return None

# --- Main App ---
st.title("💰 Payment Analysis Tool")
st.markdown("""
This simple tool helps you understand when your customers pay their invoices and why.
Just upload your payment data and follow the easy steps below.
""")

# --- Initialize Session State ---
if 'data_loaded' not in st.session_state:
    st.session_state.data_loaded = False
if 'df' not in st.session_state:
    st.session_state.df = None
if 'column_mappings' not in st.session_state:
    st.session_state.column_mappings = {}
if 'analysis_step' not in st.session_state:
    st.session_state.analysis_step = 0

# --- Sidebar ---
st.sidebar.header("📋 Simple Steps")

# 1. File Upload
uploaded_file = st.sidebar.file_uploader("Step 1: Upload Your Data", type=['csv', 'xlsx', 'xls'])
if uploaded_file and not st.session_state.data_loaded:
    df_loaded = load_and_clean_data(uploaded_file)
    if df_loaded is not None:
        st.session_state.df = df_loaded
        st.session_state.data_loaded = True
        st.session_state.analysis_step = 1  # Move to column mapping
        st.rerun()  # Refresh page

# --- Main Content Area ---
if not st.session_state.data_loaded:
    st.info("👆 Please upload your payment data file (Excel or CSV) using the sidebar to begin.")
    st.markdown("""
### What to expect:
1. **Upload your data** - We accept Excel or CSV files with your payment information
2. **Identify your columns** - Tell us which columns contain what information
3. **View simple charts** - See easy-to-understand charts showing payment patterns

This tool will help you understand:
* How quickly customers typically pay
* If certain types of customers pay faster than others
* Which factors might influence payment timing
""")
    st.stop()  # Stop here if no data

# Display data preview if loaded
st.subheader("Your Data Preview (First 5 Rows)")
st.dataframe(st.session_state.df.head(), use_container_width=True, height=300)

# 2. Column Mapping (help user identify which columns contain what information)
if st.session_state.analysis_step >= 1:
    st.sidebar.header("Step 2: Identify Your Columns")
    # These define what columns we need and what keywords might identify them
    COLUMN_KEYWORDS = {
        'customer_id': (['customer id', 'client id'], "Customer ID"),
        'amount': (['amount', 'invoice amount', 'value'], "Invoice Amount"),
        'invoice_date': (['invoice date', 'billing date'], "Invoice Date"),
        'due_date': (['due date', 'payment due'], "Payment Due Date"),
        'receipt_date': (['receipt date', 'payment date'], "Date Payment Received"),
        'payment_timing_vs_due': (['receipt date vs due date', 'days vs due', 'payment delay due'], "Days Early/Late vs Due Date"),
        'payment_timing_vs_invoice': (['receipt date vs invoice date', 'days vs invoice', 'payment delay invoice'], "Days Since Invoice Date"),
        'revenue_type': (['revenue type', 'product type', 'service type'], "Product/Service Type"),
        'payment_method': (['payment method', 'payment type', 'payment mode'], "Payment Method"),
    }
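    # Keywords are written with spaces for readability; find_column_options
    # converts them to snake_case to match the cleaned column names.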
    mapping_complete = True
    cols_list = [""] + st.session_state.df.columns.tolist()  # Add empty option
    with st.sidebar.expander("Match your columns", expanded=(st.session_state.analysis_step == 1)):
        st.markdown("**Tell us which columns in your data contain what information**")
        st.markdown("We'll try to guess, but please check if our guesses are correct:")
        for key, (keywords, description) in COLUMN_KEYWORDS.items():
            # Try to find matching columns automatically
            options = find_column_options(st.session_state.df.columns, keywords)
            default_index = 0
            if len(options) == 1:
                try:
                    default_index = cols_list.index(options[0])
                except ValueError:
                    default_index = 0
            elif len(options) > 1:
                st.info(f"Multiple options found for '{description}'. Please select the correct one.")
            # Let user confirm or select the column
            selected_col = st.selectbox(
                f"Which column has: **{description}**?",
                options=cols_list,
                index=default_index,
                key=f"map_{key}"
            )
            if selected_col:
                st.session_state.column_mappings[key] = selected_col
            else:
                st.session_state.column_mappings[key] = None  # Mark as not available
                # The amount column is absolutely necessary
                if key == 'amount':
                    st.warning(f"⚠️ We need '{description}' for analysis!")
                    mapping_complete = False
        # At least one of the two payment timing columns is required
        if not (st.session_state.column_mappings.get('payment_timing_vs_invoice')
                or st.session_state.column_mappings.get('payment_timing_vs_due')):
            st.warning("⚠️ We need at least one payment timing column for analysis!")
            mapping_complete = False
    if st.sidebar.button("Confirm Columns", key="confirm_map_button", disabled=(not mapping_complete)):
        # Prefer timing measured from the invoice date; fall back to timing vs due date
        col_pay_vs_inv = st.session_state.column_mappings.get('payment_timing_vs_invoice')
        col_pay_vs_due = st.session_state.column_mappings.get('payment_timing_vs_due')
        target_col = col_pay_vs_inv if col_pay_vs_inv else col_pay_vs_due
        if not target_col:
            st.sidebar.error("❌ We need at least one payment timing column to proceed.")
        else:
            st.session_state.column_mappings['target_col'] = target_col  # Store the chosen target
            st.session_state.analysis_step = 2  # Move to next step
            st.sidebar.success("✅ Column mapping confirmed!")
            st.rerun()
    elif not mapping_complete and st.session_state.analysis_step == 1:
        st.sidebar.warning("Please select all required columns (marked ⚠️) before continuing.")

# 3. Analysis Steps
if st.session_state.analysis_step >= 2:
    st.sidebar.header("Step 3: View Insights")
    analysis_options = [
        "Payment Timing Overview",
        "Payment Patterns by Category",
        "Customer Groups",
        "Payment Predictions",
        "Summary Report"
    ]
    selected_analysis = st.sidebar.radio("What would you like to see?", analysis_options, key="analysis_radio")

    # --- Get Mapped Columns ---
    colmap = st.session_state.column_mappings
    target_col = colmap.get('target_col')  # The column tracking payment timing

    # --- Execute Selected Analysis ---
    df_processed = st.session_state.df.copy()  # Work with a copy of the data
    # --- Step 3.1: Payment Timing Overview ---
    if selected_analysis == "Payment Timing Overview" or st.session_state.analysis_step > 2:
        if 'cleaned_df' not in st.session_state or selected_analysis == "Payment Timing Overview":
            st.subheader("Preparing Your Data")
            with st.spinner("Getting your data ready..."):
                # Convert columns to the right data types
                numeric_cols_keys = ['amount', 'payment_timing_vs_due', 'payment_timing_vs_invoice']
                date_cols_keys = ['invoice_date', 'due_date', 'receipt_date']
                for key in numeric_cols_keys:
                    col = colmap.get(key)
                    if col and col in df_processed.columns:
                        df_processed[col] = safe_convert_numeric(df_processed[col])
                for key in date_cols_keys:
                    col = colmap.get(key)
                    if col and col in df_processed.columns:
                        df_processed[col] = safe_convert_datetime(df_processed[col])
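                # The coercions above turn unparseable entries into NaN/NaT
                # (assuming the utils helpers use errors='coerce'), so the
                # missing-value handling below also covers bad raw values.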
                # Handle missing values
                st.write("Fixing missing information...")
                # Remove columns with too much missing data
                missing_percentages = df_processed.isnull().sum() / len(df_processed) * 100
                threshold = 50
                high_missing_cols = missing_percentages[missing_percentages > threshold].index
                if not high_missing_cols.empty:
                    df_processed = df_processed.drop(columns=high_missing_cols)
                    st.write("- Removed columns with too much missing data")
                # Fill in other missing values with reasonable estimates
                numerical_cols = df_processed.select_dtypes(include=np.number).columns
                for col in numerical_cols:
                    if df_processed[col].isnull().any():
                        median_val = df_processed[col].median()
                        df_processed[col] = df_processed[col].fillna(median_val)
                st.write("- Filled in missing numbers with typical values")
                # Fill in missing categories
                categorical_cols = df_processed.select_dtypes(exclude=[np.number, 'datetime64[ns]']).columns
                for col in categorical_cols:
                    if df_processed[col].isnull().any():
                        try:
                            mode_val = df_processed[col].mode()[0]
                            df_processed[col] = df_processed[col].fillna(mode_val)
                        except IndexError:
                            df_processed[col] = df_processed[col].fillna('Unknown')
                st.write("- Filled in missing categories with most common values")
                # Store cleaned data for future use
                st.session_state.cleaned_df = df_processed
                st.session_state.analysis_step = max(st.session_state.analysis_step, 3)
            st.success("✅ Data prepared successfully!")
    # Now show the payment timing overview
    if selected_analysis == "Payment Timing Overview":
        st.subheader("📊 Payment Timing Overview")
        cleaned_df = st.session_state.cleaned_df
        payment_col = colmap.get('target_col')
        if payment_col and payment_col in cleaned_df.columns:
            # Create a layout with columns
            col1, col2 = st.columns(2)
            with col1:
                # Create a histogram showing payment timing distribution
                st.write("#### Distribution of Payment Timing")
                fig_hist = px.histogram(
                    cleaned_df,
                    x=payment_col,
                    nbins=20,
                    title="How quickly payments are made",
                    labels={payment_col: "Days to Payment"},
                    color_discrete_sequence=['#3366CC'],
                )
                fig_hist.update_layout(
                    xaxis_title="Days to Payment",
                    yaxis_title="Number of Invoices",
                    showlegend=False
                )
                # Add a vertical line for average
                mean_days = cleaned_df[payment_col].mean()
                fig_hist.add_vline(
                    x=mean_days,
                    line_dash="dash",
                    line_color="red",
                    annotation_text=f"Average: {mean_days:.1f} days",
                    annotation_position="top right"
                )
                # Add a vertical line at 0 days, but only when the target
                # actually measures timing relative to the due date
                if colmap.get('payment_timing_vs_due') == payment_col:
                    fig_hist.add_vline(
                        x=0,
                        line_dash="dash",
                        line_color="green",
                        annotation_text="Due Date",
                        annotation_position="top left"
                    )
                st.plotly_chart(fig_hist, use_container_width=True)
            with col2:
                # Create a box plot showing the spread of payment timings
                st.write("#### Payment Timing Statistics")
                fig_box = px.box(
                    cleaned_df,
                    y=payment_col,
                    title="Range of Payment Timings",
                    points="all",
                    labels={payment_col: "Days to Payment"},
                    color_discrete_sequence=['#3366CC'],
                )
                fig_box.update_layout(
                    yaxis_title="Days to Payment",
                    showlegend=False
                )
                st.plotly_chart(fig_box, use_container_width=True)
            # Payment timing stats
            st.write("#### Key Payment Statistics")
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Average Days", f"{cleaned_df[payment_col].mean():.1f}")
            with col2:
                st.metric("Median Days", f"{cleaned_df[payment_col].median():.1f}")
            early_percent = None
            with col3:
                # On-time share only makes sense when timing is measured vs the due date
                if colmap.get('payment_timing_vs_due') == payment_col:
                    early_percent = (cleaned_df[payment_col] <= 0).mean() * 100
                    st.metric("% Paid On Time", f"{early_percent:.1f}%")
                else:
                    st.metric("Minimum Days", f"{cleaned_df[payment_col].min():.1f}")
            with col4:
                st.metric("Maximum Days", f"{cleaned_df[payment_col].max():.1f}")
            # Get AI explanation if payment_col is days vs due date
            if colmap.get('payment_timing_vs_due') == payment_col:
                # Ask AI for explanation of payment patterns
                prompt = f"""Based on this payment data summary:
- Average days to payment: {cleaned_df[payment_col].mean():.1f}
- Median days to payment: {cleaned_df[payment_col].median():.1f}
- % paid on time: {early_percent:.1f}%
- Maximum days late: {cleaned_df[payment_col].max():.1f}
Please explain in simple terms what this tells us about how customers are paying their invoices.
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
"""
                with st.expander("💡 What does this mean for my business?", expanded=True):
                    explanation = ask_gemini(prompt)
                    st.markdown(explanation)
        else:
            st.error("No payment timing column available. Please check your column mappings.")
    # --- Step 3.2: Payment Patterns by Category ---
    if selected_analysis == "Payment Patterns by Category":
        st.subheader("📊 Payment Patterns by Category")
        cleaned_df = st.session_state.cleaned_df
        payment_col = colmap.get('target_col')
        # Define possible category columns and let user select which to analyze
        category_cols = []
        for key in ['revenue_type', 'payment_method', 'customer_id']:
            col = colmap.get(key)
            if col and col in cleaned_df.columns:
                category_cols.append(col)
        if not category_cols:
            st.warning("No category columns were identified. Please go back to column mapping and identify at least one of: Revenue Type, Payment Method, or Customer ID.")
        else:
            # Let user select which category to analyze
            selected_category = st.selectbox(
                "Select category to analyze:",
                options=category_cols,
                key="category_selector"
            )
            if selected_category and payment_col and payment_col in cleaned_df.columns:
                # Limit categories to the 10 most frequent to avoid cluttered charts
                top_categories = cleaned_df[selected_category].value_counts().nlargest(10).index
                filtered_df = cleaned_df[cleaned_df[selected_category].isin(top_categories)]
                # Create layout with columns
                col1, col2 = st.columns(2)
                with col1:
                    # Box plot showing payment timing by category
                    st.write(f"#### Payment Timing by {selected_category}")
                    fig_category_box = px.box(
                        filtered_df,
                        x=selected_category,
                        y=payment_col,
                        color=selected_category,
                        title=f"How different {selected_category} categories pay",
                        labels={payment_col: "Days to Payment"},
                    )
                    fig_category_box.update_layout(
                        xaxis_title=selected_category,
                        yaxis_title="Days to Payment",
                        xaxis={'categoryorder': 'total descending'}
                    )
                    st.plotly_chart(fig_category_box, use_container_width=True)
                with col2:
                    # Bar chart showing average payment time by category
                    st.write(f"#### Average Payment Time by {selected_category}")
                    category_avg = filtered_df.groupby(selected_category)[payment_col].mean().reset_index()
                    category_avg = category_avg.sort_values(payment_col)
                    fig_category_bar = px.bar(
                        category_avg,
                        x=selected_category,
                        y=payment_col,
                        color=selected_category,
                        title=f"Average days to payment by {selected_category}",
                        labels={payment_col: "Average Days to Payment"},
                    )
                    fig_category_bar.update_layout(
                        xaxis_title=selected_category,
                        yaxis_title="Average Days to Payment",
                        showlegend=False
                    )
                    st.plotly_chart(fig_category_bar, use_container_width=True)
                # Calculate statistics by category
                category_stats = filtered_df.groupby(selected_category).agg({
                    payment_col: ['mean', 'median', 'count'],
                })
                category_stats.columns = [' '.join(col).strip() for col in category_stats.columns.values]
                category_stats = category_stats.reset_index().rename(
                    columns={f"{payment_col} mean": "Avg Days",
                             f"{payment_col} median": "Median Days",
                             f"{payment_col} count": "Count"}
                )
                category_stats["Avg Days"] = category_stats["Avg Days"].round(1)
                category_stats["Median Days"] = category_stats["Median Days"].round(1)
                # Show the statistics table
                st.write(f"#### Statistics by {selected_category}")
                st.dataframe(category_stats, use_container_width=True)
                # Get AI explanation based on category
                fastest_category = category_stats.loc[category_stats["Avg Days"].idxmin(), selected_category]
                slowest_category = category_stats.loc[category_stats["Avg Days"].idxmax(), selected_category]
                avg_diff = category_stats["Avg Days"].max() - category_stats["Avg Days"].min()
                prompt = f"""Based on this payment data analysis by {selected_category}:
- Fastest paying category: {fastest_category} (average {category_stats['Avg Days'].min():.1f} days)
- Slowest paying category: {slowest_category} (average {category_stats['Avg Days'].max():.1f} days)
- Difference between fastest and slowest: {avg_diff:.1f} days
Please explain in simple terms what this tells us about how different {selected_category} categories are paying.
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
"""
                with st.expander("💡 What does this pattern mean?", expanded=True):
                    explanation = ask_gemini(prompt)
                    st.markdown(explanation)
    # --- Step 3.3: Customer Groups ---
    if selected_analysis == "Customer Groups":
        st.subheader("👥 Customer Payment Behavior Groups")
        cleaned_df = st.session_state.cleaned_df
        payment_col = colmap.get('target_col')
        amount_col = colmap.get('amount')
        customer_col = colmap.get('customer_id')
        if not customer_col or customer_col not in cleaned_df.columns:
            st.warning("Please identify a Customer ID column in the column mapping step to see customer groups.")
        elif not payment_col or payment_col not in cleaned_df.columns:
            st.warning("No payment timing column available. Please check your column mappings.")
        else:
            # Create customer-level summary
            st.write("Analyzing customer payment patterns...")
            # Group by customer and calculate statistics
            customer_stats = cleaned_df.groupby(customer_col).agg({
                payment_col: ['mean', 'median', 'min', 'max', 'count'],
            })
            customer_stats.columns = [' '.join(col).strip() for col in customer_stats.columns.values]
            customer_stats = customer_stats.reset_index()
            # Add amount stats if available
            if amount_col and amount_col in cleaned_df.columns:
                amount_stats = cleaned_df.groupby(customer_col).agg({
                    amount_col: ['mean', 'sum'],
                })
                amount_stats.columns = [' '.join(col).strip() for col in amount_stats.columns.values]
                amount_stats = amount_stats.reset_index()
                customer_stats = customer_stats.merge(amount_stats, on=customer_col)
            # Define payment behavior groups based on mean payment time
            # Only apply if we have payment timing vs due date
            if colmap.get('payment_timing_vs_due') == payment_col:
                # Create behavior groups
                mean_col = customer_stats[f"{payment_col} mean"]
                conditions = [
                    (mean_col < -5),                      # Very early (>5 days before due)
                    (mean_col >= -5) & (mean_col < 0),    # Early (0-5 days before due)
                    (mean_col >= 0) & (mean_col < 15),    # On time to slightly late (0-15 days)
                    (mean_col >= 15) & (mean_col < 30),   # Moderately late (15-30 days)
                    (mean_col >= 30)                      # Very late (>30 days)
                ]
                values = ['Very Early Payers', 'Early Payers', 'On-time/Slightly Late', 'Moderately Late', 'Very Late Payers']
                customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
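                # e.g. a customer averaging -7 days falls in 'Very Early Payers';
                # one averaging +20 days falls in 'Moderately Late'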
            else:
                # If we don't have vs due date, create relative groups
                mean_col = customer_stats[f"{payment_col} mean"]
                median_pay_time = mean_col.median()
                conditions = [
                    (mean_col < 0.6 * median_pay_time),                                       # Much faster than median
                    (mean_col >= 0.6 * median_pay_time) & (mean_col < 0.9 * median_pay_time),   # Faster than median
                    (mean_col >= 0.9 * median_pay_time) & (mean_col <= 1.1 * median_pay_time),  # Around median
                    (mean_col > 1.1 * median_pay_time) & (mean_col <= 1.5 * median_pay_time),   # Slower than median
                    (mean_col > 1.5 * median_pay_time)                                        # Much slower than median
                ]
                values = ['Much Faster Payers', 'Faster Payers', 'Average Payers', 'Slower Payers', 'Much Slower Payers']
                customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
            # Count customers in each group
            behavior_counts = customer_stats['Payment Behavior'].value_counts().reset_index()
            behavior_counts.columns = ['Payment Behavior', 'Number of Customers']
            # Create tabs for different views
            tab1, tab2 = st.tabs(["Customer Groups", "Individual Customers"])
            with tab1:
                st.write("#### Customer Payment Behavior Groups")
                # Create a pie chart showing distribution of customer behavior
                fig_pie = px.pie(
                    behavior_counts,
                    values='Number of Customers',
                    names='Payment Behavior',
                    title="Distribution of Customer Payment Behavior"
                )
                st.plotly_chart(fig_pie, use_container_width=True)
                # Show statistics for each behavior group
                behavior_group_stats = customer_stats.groupby('Payment Behavior').agg({
                    f"{payment_col} mean": 'mean',
                    f"{payment_col} count": 'sum',
                    customer_col: 'count'
                }).reset_index()
                behavior_group_stats.columns = ['Payment Behavior', 'Avg Days to Payment', 'Total Invoices', 'Customer Count']
                behavior_group_stats["Avg Days to Payment"] = behavior_group_stats["Avg Days to Payment"].round(1)
                # Add amount statistics if available
                if amount_col and amount_col in cleaned_df.columns and f"{amount_col} sum" in customer_stats.columns:
                    amount_by_behavior = customer_stats.groupby('Payment Behavior')[f"{amount_col} sum"].sum().reset_index()
                    behavior_group_stats = behavior_group_stats.merge(amount_by_behavior, on='Payment Behavior')
                    behavior_group_stats.rename(columns={f"{amount_col} sum": "Total Amount"}, inplace=True)
                st.dataframe(behavior_group_stats, use_container_width=True)
                # Get AI explanation
                largest_group = behavior_counts.loc[behavior_counts['Number of Customers'].idxmax(), 'Payment Behavior']
                prompt = f"""Based on this customer payment behavior analysis:
- Largest customer group: {largest_group} ({behavior_counts['Number of Customers'].max()} customers)
- Total customer segments: {len(behavior_counts)}
Please explain in simple terms what this tells us about our customer base and their payment habits.
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
"""
                with st.expander("💡 What does this mean for my business?", expanded=True):
                    explanation = ask_gemini(prompt)
                    st.markdown(explanation)
            with tab2:
                st.write("#### Individual Customer Payment Behavior")
                st.write("Search for specific customers or sort by payment behavior:")
                # Prepare the customer table
                customer_display = customer_stats.copy()
                customer_display = customer_display.rename(columns={
                    f"{payment_col} mean": "Avg Days to Payment",
                    f"{payment_col} median": "Median Days",
                    f"{payment_col} min": "Min Days",
                    f"{payment_col} max": "Max Days",
                    f"{payment_col} count": "Invoice Count"
                })
                if amount_col and amount_col in cleaned_df.columns:
                    customer_display = customer_display.rename(columns={
                        f"{amount_col} mean": "Avg Amount",
                        f"{amount_col} sum": "Total Amount"
                    })
                # Round numeric columns
                numeric_cols = customer_display.select_dtypes(include=[np.number]).columns
                customer_display[numeric_cols] = customer_display[numeric_cols].round(2)
                # Allow filtering
                selected_behavior = st.multiselect(
                    "Filter by payment behavior:",
                    options=customer_display['Payment Behavior'].unique(),
                    default=None
                )
                if selected_behavior:
                    filtered_customers = customer_display[customer_display['Payment Behavior'].isin(selected_behavior)]
                else:
                    filtered_customers = customer_display
                st.dataframe(filtered_customers, use_container_width=True)
    # --- Step 3.4: Payment Predictions ---
    if selected_analysis == "Payment Predictions":
        st.subheader("🔮 Payment Time Prediction Model")
        cleaned_df = st.session_state.cleaned_df
        payment_col = colmap.get('target_col')
        # Check if we have the necessary data
        if not payment_col or payment_col not in cleaned_df.columns:
            st.warning("No payment timing column available. Please check your column mappings.")
        else:
            st.write("This model helps you predict when customers will pay based on invoice characteristics.")
            # Identify potential predictor variables
            numeric_cols = cleaned_df.select_dtypes(include=np.number).columns.tolist()
            categorical_cols = cleaned_df.select_dtypes(include=['object', 'category']).columns.tolist()
            # Remove the target variable from predictors
            if payment_col in numeric_cols:
                numeric_cols.remove(payment_col)
            # Prepare predictor variables
            potential_predictors = []
            # Add amount if available
            amount_col = colmap.get('amount')
            if amount_col and amount_col in cleaned_df.columns:
                potential_predictors.append(amount_col)
            # Add customer_id if available (skip high-cardinality IDs)
            customer_col = colmap.get('customer_id')
            if customer_col and customer_col in cleaned_df.columns and len(cleaned_df[customer_col].unique()) < 100:
                potential_predictors.append(customer_col)
            # Add revenue_type and payment_method if available
            for key in ['revenue_type', 'payment_method']:
                col = colmap.get(key)
                if col and col in cleaned_df.columns:
                    potential_predictors.append(col)
            # Add other numeric columns that might be useful, excluding both
            # payment-timing columns: they are unknown for a new invoice and
            # would leak the answer into the model
            timing_cols = {colmap.get('payment_timing_vs_due'), colmap.get('payment_timing_vs_invoice')}
            for col in numeric_cols:
                if col not in potential_predictors and col not in timing_cols and 'date' not in col.lower():
                    potential_predictors.append(col)
            if len(potential_predictors) < 1:
                st.warning("Not enough predictor variables available for modeling. Please identify more columns in your data.")
            else:
                # Let user select predictor variables
                st.write("#### Select variables to use for prediction")
                selected_predictors = st.multiselect(
                    "Choose which factors might influence payment timing:",
                    options=potential_predictors,
                    default=potential_predictors[:min(3, len(potential_predictors))]  # Default to first 3
                )
                if len(selected_predictors) < 1:
                    st.warning("Please select at least one predictor variable.")
                else:
                    # Prepare data for modeling
                    X = cleaned_df[selected_predictors].copy()
                    y = cleaned_df[payment_col].copy()
                    # Process categorical variables
                    X_processed = pd.DataFrame()
                    for col in X.columns:
                        if col in categorical_cols:
                            # One-hot encode categorical variables
                            dummies = pd.get_dummies(X[col], prefix=col, drop_first=True)
                            X_processed = pd.concat([X_processed, dummies], axis=1)
                        else:
                            # Keep numeric columns as is
                            X_processed[col] = X[col]
                    # Check if we have any data after processing
                    if X_processed.shape[1] == 0:
                        st.warning("No usable predictor variables after processing. Please select different variables.")
                    else:
                        # Build and train the model
                        with st.spinner("Training prediction model..."):
                            try:
                                # Fill any remaining missing values with median
                                X_processed = X_processed.fillna(X_processed.median())
                                # Train a Random Forest model
                                model = RandomForestRegressor(n_estimators=100, random_state=42)
                                model.fit(X_processed, y)
                                # Get feature importances
                                feature_imp = pd.DataFrame({
                                    'Feature': X_processed.columns,
                                    'Importance': model.feature_importances_
                                }).sort_values('Importance', ascending=False)
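                                # Note: RandomForestRegressor importances are
                                # impurity-based and normalised to sum to 1.0,
                                # so scores are relative, not absolute effects.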
st.success("✅ Payment prediction model trained!") | |
# Show feature importances | |
st.write("#### Factors that influence payment timing") | |
fig_imp = px.bar( | |
feature_imp.head(10), | |
x='Importance', | |
y='Feature', | |
orientation='h', | |
title="Which factors best predict payment timing", | |
labels={'Importance': 'Importance Score'}, | |
) | |
st.plotly_chart(fig_imp, use_container_width=True) | |
# Get AI explanation of feature importance | |
top_features = ", ".join(feature_imp.head(3)['Feature'].tolist()) | |
prompt = f"""Based on this payment prediction model analysis: | |
- Top predictive factors: {top_features} | |
Please explain in simple terms what this tells us about what influences payment timing. | |
Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business. | |
""" | |
with st.expander("💡 What influences payment timing?", expanded=True): | |
explanation = ask_gemini(prompt) | |
st.markdown(explanation) | |
# Create prediction interface | |
st.write("#### Predict payment timing for new invoices") | |
st.write("Enter values for a new invoice to predict when it will be paid:") | |
# Create input widgets for each predictor | |
new_invoice_data = {} | |
for col in selected_predictors: | |
if col in categorical_cols: | |
options = cleaned_df[col].unique().tolist() | |
new_invoice_data[col] = st.selectbox(f"Select {col}:", options=options) | |
else: | |
min_val = cleaned_df[col].min() | |
max_val = cleaned_df[col].max() | |
step = (max_val - min_val) / 100 | |
new_invoice_data[col] = st.slider(f"Set {col}:", min_value=float(min_val), max_value=float(max_val), step=float(step)) | |
# Make prediction when button is clicked | |
if st.button("Predict Payment Timing"): | |
# Prepare the input data in the same format as the training data | |
X_new = pd.DataFrame([new_invoice_data]) | |
X_new_processed = pd.DataFrame() | |
# Process the input data the same way as training data | |
for col in X_new.columns: | |
if col in categorical_cols: | |
dummies = pd.get_dummies(X_new[col], prefix=col, drop_first=True) | |
X_new_processed = pd.concat([X_new_processed, dummies], axis=1) | |
else: | |
X_new_processed[col] = X_new[col] | |
# Add missing columns that were in the training data | |
for col in X_processed.columns: | |
if col not in X_new_processed.columns: | |
X_new_processed[col] = 0 | |
# Keep only the columns used during training | |
X_new_processed = X_new_processed[X_processed.columns] | |
# Make prediction | |
prediction = model.predict(X_new_processed)[0] | |
# Display prediction | |
st.success(f"Predicted payment timing: **{prediction:.1f} days**") | |
# Add interpretation if we're predicting vs due date | |
if colmap.get('payment_timing_vs_due') == payment_col: | |
if prediction < 0: | |
st.info(f"This invoice is predicted to be paid **{abs(prediction):.1f} days before** the due date.") | |
elif prediction == 0: | |
st.info("This invoice is predicted to be paid exactly on the due date.") | |
else: | |
st.info(f"This invoice is predicted to be paid **{prediction:.1f} days after** the due date.") | |
except Exception as e: | |
st.error(f"Error building prediction model: {e}") | |
    # --- Step 3.5: Summary Report ---
    if selected_analysis == "Summary Report":
        st.subheader("📝 Payment Analysis Summary Report")
        cleaned_df = st.session_state.cleaned_df
        payment_col = colmap.get('target_col')
        if not payment_col or payment_col not in cleaned_df.columns:
            st.warning("No payment timing column available. Please check your column mappings.")
        else:
            # Overall payment statistics
            st.write("### Overall Payment Performance")
            mean_days = cleaned_df[payment_col].mean()
            median_days = cleaned_df[payment_col].median()
            # Calculate on-time percentage if we have vs due date
            if colmap.get('payment_timing_vs_due') == payment_col:
                ontime_pct = (cleaned_df[payment_col] <= 0).mean() * 100
                late_pct = 100 - ontime_pct
            # Display metrics in columns
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Average Days to Payment", f"{mean_days:.1f}")
            with col2:
                st.metric("Median Days to Payment", f"{median_days:.1f}")
            with col3:
                if colmap.get('payment_timing_vs_due') == payment_col:
                    st.metric("% Paid On Time", f"{ontime_pct:.1f}%")
                else:
                    st.metric("Total Invoices", f"{len(cleaned_df)}")
            # Summary visualization
            st.write("### Payment Timeline")
            # Create a histogram of payment distribution
            fig_summary = px.histogram(
                cleaned_df,
                x=payment_col,
                nbins=30,
                title="Distribution of Payment Timing",
                color_discrete_sequence=['#3366CC']
            )
            # Add markers for key statistics
            fig_summary.add_vline(x=mean_days, line_dash="dash", line_color="red", annotation_text=f"Mean: {mean_days:.1f}")
            fig_summary.add_vline(x=median_days, line_dash="dash", line_color="green", annotation_text=f"Median: {median_days:.1f}")
            if colmap.get('payment_timing_vs_due') == payment_col:
                fig_summary.add_vline(x=0, line_dash="solid", line_color="black", annotation_text="Due Date")
            st.plotly_chart(fig_summary, use_container_width=True)
            # Generate a summary report with key findings
            st.write("### Key Findings")
            # Gather key data points for the AI summary
            summary_data = {
                "mean_days": mean_days,
                "median_days": median_days,
                "min_days": cleaned_df[payment_col].min(),
                "max_days": cleaned_df[payment_col].max(),
                "invoice_count": len(cleaned_df)
            }
            # Add on-time percentages if available
            if colmap.get('payment_timing_vs_due') == payment_col:
                summary_data["ontime_pct"] = ontime_pct
                summary_data["late_pct"] = late_pct
            # Add category information if available
            category_insights = []
            for cat_key in ['revenue_type', 'payment_method']:
                cat_col = colmap.get(cat_key)
                if cat_col and cat_col in cleaned_df.columns:
                    # Calculate best and worst categories
                    cat_avg = cleaned_df.groupby(cat_col)[payment_col].mean()
                    if not cat_avg.empty:
                        best_cat = cat_avg.idxmin()
                        worst_cat = cat_avg.idxmax()
                        best_days = cat_avg.min()
                        worst_days = cat_avg.max()
                        category_insights.append(f"Best {cat_key}: {best_cat} ({best_days:.1f} days)")
                        category_insights.append(f"Worst {cat_key}: {worst_cat} ({worst_days:.1f} days)")
            # Build prompt for AI summary
            prompt = f"""Based on this payment data analysis:
- Average days to payment: {summary_data['mean_days']:.1f}
- Median days to payment: {summary_data['median_days']:.1f}
- Range: {summary_data['min_days']:.1f} to {summary_data['max_days']:.1f} days
- Total invoices analyzed: {summary_data['invoice_count']}
"""
            if 'ontime_pct' in summary_data:
                prompt += f"- On-time payments: {summary_data['ontime_pct']:.1f}%\n"
                prompt += f"- Late payments: {summary_data['late_pct']:.1f}%\n"
            for insight in category_insights:
                prompt += f"- {insight}\n"
            prompt += """\nPlease provide a clear, concise 3-4 sentence summary of what this payment data tells us about the business.
Focus on the most important insights that would be valuable for financial decision-making.
Use simple, non-technical language that anyone in the business could understand.
"""
            with st.expander("💡 Summary of Key Findings", expanded=True):
                final_summary = ask_gemini(prompt)
                st.markdown(final_summary)
            # Recommendations section
            st.write("### Recommendations")
            # Build prompt for AI recommendations
            rec_prompt = prompt + "\n\nBased on this data, please provide 3-4 specific, actionable recommendations to improve payment collection. Each recommendation should be 1-2 sentences and focused on practical actions the business can take."
            recommendations = ask_gemini(rec_prompt)
            st.markdown(recommendations)