Spaces:

guifav
/

data_analytics

Sleeping

data_analytics / app.py

Guilherme Favaron

Add Streamlit data analytics app

9968889 8 months ago

17.5 kB

	import streamlit as st
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import numpy as np
	import io
	import base64
	import re
	from datetime import datetime

	# Page configuration
	st.set_page_config(
	page_title="Data Analysis Dashboard",
	page_icon="📊",
	layout="wide",
	initial_sidebar_state="expanded"
	)

	# Custom CSS for better styling
	st.markdown("""
	<style>
	.main-header {
	font-size: 2.5rem;
	font-weight: bold;
	color: #1f77b4;
	text-align: center;
	margin-bottom: 2rem;
	}
	.metric-container {
	background-color: #f0f2f6;
	padding: 1rem;
	border-radius: 0.5rem;
	margin: 0.5rem 0;
	}
	.stSelectbox > div > div {
	background-color: white;
	}
	.upload-section {
	border: 2px dashed #cccccc;
	border-radius: 10px;
	padding: 2rem;
	text-align: center;
	margin: 1rem 0;
	}
	</style>
	""", unsafe_allow_html=True)

	def convert_brazilian_number(value):
	"""Convert Brazilian number format (xx.xxx.xxx,xx) to float"""
	if pd.isna(value) or value == '':
	return np.nan

	# Convert to string if not already
	str_value = str(value).strip()

	# Check if it's already a number
	try:
	return float(str_value)
	except ValueError:
	pass

	# Brazilian number pattern: can have dots as thousand separators and comma as decimal
	# Examples: "1.234.567,89", "1.234,56", "1234,56", "1234"
	brazilian_pattern = r'^-?\d{1,3}(?:\.\d{3})*(?:,\d+)?$'

	if re.match(brazilian_pattern, str_value):
	# Remove thousand separators (dots) and replace decimal comma with dot
	converted = str_value.replace('.', '').replace(',', '.')
	try:
	return float(converted)
	except ValueError:
	return np.nan

	return np.nan

	def detect_and_convert_brazilian_numbers(df):
	"""Detect and convert Brazilian number format columns to numeric"""
	converted_columns = []
	df_converted = df.copy()

	for col in df.columns:
	if df[col].dtype == 'object': # Only check string columns
	# Sample some non-null values to check if they look like Brazilian numbers
	sample_values = df[col].dropna().astype(str).head(10)

	if len(sample_values) > 0:
	# Check if most values match Brazilian number pattern
	brazilian_count = 0
	total_count = 0

	for value in sample_values:
	value = str(value).strip()
	if value and value != 'nan':
	total_count += 1
	# Brazilian number pattern
	if re.match(r'^-?\d{1,3}(?:\.\d{3})*(?:,\d+)?$', value) or re.match(r'^-?\d+,\d+$', value):
	brazilian_count += 1

	# If more than 70% of values look like Brazilian numbers, convert the column
	if total_count > 0 and (brazilian_count / total_count) > 0.7:
	converted_series = df[col].apply(convert_brazilian_number)

	# Only convert if we successfully converted most values
	non_null_original = df[col].notna().sum()
	non_null_converted = converted_series.notna().sum()

	if non_null_converted >= (non_null_original * 0.8): # At least 80% conversion success
	df_converted[col] = converted_series
	converted_columns.append(col)

	return df_converted, converted_columns

	def load_sample_data():
	"""Generate sample data for demonstration"""
	np.random.seed(42)
	n_samples = 1000

	data = {
	'Date': pd.date_range('2023-01-01', periods=n_samples, freq='D'),
	'Sales': np.random.normal(1000, 200, n_samples),
	'Profit': np.random.normal(150, 50, n_samples),
	'Category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home'], n_samples),
	'Region': np.random.choice(['North', 'South', 'East', 'West'], n_samples),
	'Customer_Age': np.random.randint(18, 80, n_samples),
	'Rating': np.random.uniform(1, 5, n_samples)
	}

	df = pd.DataFrame(data)
	df['Sales'] = np.where(df['Sales'] < 0, abs(df['Sales']), df['Sales'])
	df['Profit'] = np.where(df['Category'] == 'Electronics', df['Profit'] * 1.5, df['Profit'])

	# Add some Brazilian formatted numbers for demonstration
	df['Vendas_BR'] = df['Sales'].apply(lambda x: f"{x:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '.'))
	df['Lucro_BR'] = df['Profit'].apply(lambda x: f"{x:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '.'))

	return df

	def get_numeric_columns(df):
	"""Get numeric columns from dataframe"""
	return df.select_dtypes(include=[np.number]).columns.tolist()

	def get_categorical_columns(df):
	"""Get categorical columns from dataframe"""
	return df.select_dtypes(include=['object', 'category']).columns.tolist()

	def create_download_link(df, filename="filtered_data.csv"):
	"""Create download link for dataframe"""
	csv = df.to_csv(index=False)
	b64 = base64.b64encode(csv.encode()).decode()
	href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV File</a>'
	return href

	def main():
	# Header
	st.markdown('<h1 class="main-header">📊 Data Analysis Dashboard</h1>', unsafe_allow_html=True)

	# Sidebar
	st.sidebar.title("🔧 Controls")
	st.sidebar.markdown("---")

	# File upload section
	st.sidebar.subheader("📁 Data Upload")
	uploaded_file = st.sidebar.file_uploader(
	"Choose a CSV file",
	type="csv",
	help="Upload a CSV file to analyze your data"
	)

	use_sample = st.sidebar.checkbox(
	"Use Sample Data",
	value=True if uploaded_file is None else False,
	help="Check this to use built-in sample data for demonstration"
	)

	# Brazilian number conversion option
	convert_brazilian = st.sidebar.checkbox(
	"🇧🇷 Auto-convert Brazilian Numbers",
	value=True,
	help="Automatically detect and convert Brazilian number format (xx.xxx.xxx,xx) to numeric"
	)

	# Load data
	try:
	if uploaded_file is not None:
	df = pd.read_csv(uploaded_file)
	st.sidebar.success(f"✅ File uploaded successfully! ({len(df)} rows)")
	elif use_sample:
	df = load_sample_data()
	st.sidebar.info("📋 Using sample data")
	else:
	st.warning("Please upload a CSV file or use sample data to get started.")
	st.markdown("""
	### 🚀 Welcome to the Data Analysis Dashboard!

	This app helps you analyze and visualize your data with:
	- Interactive charts (bar, line, scatter, histogram)
	- Dynamic filtering and data exploration
	- Statistical summaries and insights
	- Export capabilities for data and visualizations
	- 🇧🇷 Brazilian number format support (xx.xxx.xxx,xx)

	To get started:
	1. Upload a CSV file using the sidebar, or
	2. Check "Use Sample Data" to explore with demo data
	""")
	return

	# Apply Brazilian number conversion if enabled
	if convert_brazilian:
	df_original = df.copy()
	df, converted_cols = detect_and_convert_brazilian_numbers(df)

	if converted_cols:
	st.sidebar.success(f"🇧🇷 Converted {len(converted_cols)} columns from Brazilian format: {', '.join(converted_cols)}")

	except Exception as e:
	st.error(f"❌ Error loading file: {str(e)}")
	st.info("Please make sure your file is a valid CSV format.")
	return

	# Data preview section
	st.subheader("📋 Data Preview")

	col1, col2, col3, col4 = st.columns(4)
	with col1:
	st.metric("Total Rows", len(df))
	with col2:
	st.metric("Total Columns", len(df.columns))
	with col3:
	st.metric("Numeric Columns", len(get_numeric_columns(df)))
	with col4:
	st.metric("Text Columns", len(get_categorical_columns(df)))

	# Show data preview
	with st.expander("🔍 View Raw Data", expanded=False):
	st.dataframe(df.head(100), use_container_width=True)

	# Data summary
	with st.expander("📊 Statistical Summary", expanded=False):
	col1, col2 = st.columns(2)

	with col1:
	st.subheader("Numeric Columns")
	numeric_cols = get_numeric_columns(df)
	if numeric_cols:
	st.dataframe(df[numeric_cols].describe())
	else:
	st.info("No numeric columns found")

	with col2:
	st.subheader("Categorical Columns")
	cat_cols = get_categorical_columns(df)
	if cat_cols:
	for col in cat_cols[:5]: # Show first 5 categorical columns
	st.write(f"{col}: {df[col].nunique()} unique values")
	if df[col].nunique() <= 10:
	st.write(df[col].value_counts().head())
	else:
	st.info("No categorical columns found")

	# Show conversion info if Brazilian conversion was applied
	if convert_brazilian and 'converted_cols' in locals() and converted_cols:
	with st.expander("🇧🇷 Brazilian Number Conversion Details", expanded=False):
	st.write("Converted Columns:")
	for col in converted_cols:
	original_sample = df_original[col].dropna().head(3).tolist()
	converted_sample = df[col].dropna().head(3).tolist()
	st.write(f"{col}:")
	st.write(f" - Original: {original_sample}")
	st.write(f" - Converted: {converted_sample}")

	# Filtering section
	st.sidebar.markdown("---")
	st.sidebar.subheader("🔍 Data Filters")

	# Create a copy for filtering
	filtered_df = df.copy()

	# Numeric filters
	numeric_cols = get_numeric_columns(df)
	for col in numeric_cols:
	if df[col].dtype in ['int64', 'float64']:
	min_val = float(df[col].min())
	max_val = float(df[col].max())

	if min_val != max_val:
	selected_range = st.sidebar.slider(
	f"{col} Range",
	min_value=min_val,
	max_value=max_val,
	value=(min_val, max_val),
	help=f"Filter data by {col} values"
	)
	filtered_df = filtered_df[
	(filtered_df[col] >= selected_range[0]) &
	(filtered_df[col] <= selected_range[1])
	]

	# Categorical filters
	cat_cols = get_categorical_columns(df)
	for col in cat_cols:
	unique_values = df[col].unique().tolist()
	if len(unique_values) <= 50: # Only show filter for columns with reasonable number of unique values
	selected_values = st.sidebar.multiselect(
	f"Select {col}",
	options=unique_values,
	default=unique_values,
	help=f"Filter data by {col} categories"
	)
	if selected_values:
	filtered_df = filtered_df[filtered_df[col].isin(selected_values)]

	# Show filtered data info
	if len(filtered_df) != len(df):
	st.sidebar.info(f"Filtered: {len(filtered_df)} of {len(df)} rows")

	# Visualization section
	st.markdown("---")
	st.subheader("📈 Data Visualization")

	# Chart type selection
	chart_type = st.selectbox(
	"Select Chart Type",
	["Bar Chart", "Line Chart", "Scatter Plot", "Histogram", "Box Plot"],
	help="Choose the type of visualization"
	)

	col1, col2, col3 = st.columns(3)

	with col1:
	if chart_type in ["Bar Chart", "Line Chart", "Scatter Plot", "Box Plot"]:
	x_column = st.selectbox(
	"X-axis Column",
	options=df.columns.tolist(),
	help="Select column for X-axis"
	)
	else:
	x_column = st.selectbox(
	"Column to Analyze",
	options=numeric_cols,
	help="Select numeric column for histogram"
	)

	with col2:
	if chart_type in ["Bar Chart", "Line Chart", "Scatter Plot", "Box Plot"]:
	y_column = st.selectbox(
	"Y-axis Column",
	options=numeric_cols,
	help="Select numeric column for Y-axis"
	)
	else:
	y_column = None

	with col3:
	if chart_type in ["Bar Chart", "Scatter Plot", "Box Plot"]:
	color_column = st.selectbox(
	"Color/Group By (Optional)",
	options=[None] + cat_cols,
	help="Select column to group/color data"
	)
	else:
	color_column = None

	# Create visualization
	if chart_type == "Bar Chart" and x_column and y_column:
	if x_column in cat_cols:
	# Aggregate data for categorical x-axis
	agg_df = filtered_df.groupby(x_column)[y_column].mean().reset_index()
	fig = px.bar(
	agg_df,
	x=x_column,
	y=y_column,
	title=f"Average {y_column} by {x_column}",
	color=color_column if color_column and color_column in agg_df.columns else None
	)
	else:
	fig = px.bar(
	filtered_df,
	x=x_column,
	y=y_column,
	title=f"{y_column} vs {x_column}",
	color=color_column
	)

	elif chart_type == "Line Chart" and x_column and y_column:
	fig = px.line(
	filtered_df,
	x=x_column,
	y=y_column,
	title=f"{y_column} vs {x_column}",
	color=color_column
	)

	elif chart_type == "Scatter Plot" and x_column and y_column:
	fig = px.scatter(
	filtered_df,
	x=x_column,
	y=y_column,
	title=f"{y_column} vs {x_column}",
	color=color_column,
	size=y_column if y_column in numeric_cols else None
	)

	elif chart_type == "Histogram" and x_column:
	fig = px.histogram(
	filtered_df,
	x=x_column,
	title=f"Distribution of {x_column}",
	nbins=30
	)

	elif chart_type == "Box Plot" and x_column and y_column:
	fig = px.box(
	filtered_df,
	x=x_column,
	y=y_column,
	title=f"{y_column} Distribution by {x_column}",
	color=color_column
	)

	else:
	st.warning("Please select appropriate columns for the chosen chart type.")
	return

	# Update layout for better appearance
	fig.update_layout(
	height=500,
	showlegend=True,
	title_x=0.5,
	font=dict(size=12)
	)

	# Display chart
	st.plotly_chart(fig, use_container_width=True)

	# Download section
	st.markdown("---")
	st.subheader("💾 Download Options")

	col1, col2 = st.columns(2)

	with col1:
	st.markdown("Download Filtered Data")
	if st.button("Generate CSV Download Link"):
	download_link = create_download_link(filtered_df, f"filtered_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
	st.markdown(download_link, unsafe_allow_html=True)

	with col2:
	st.markdown("Download Chart")
	if st.button("Download Chart as HTML"):
	html_string = fig.to_html(include_plotlyjs='cdn')
	st.download_button(
	label="Download HTML",
	data=html_string,
	file_name=f"chart_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html",
	mime="text/html"
	)

	# Additional insights
	if len(filtered_df) > 0:
	st.markdown("---")
	st.subheader("🔍 Quick Insights")

	col1, col2 = st.columns(2)

	with col1:
	st.markdown("Data Overview")
	st.write(f"• Total records: {len(filtered_df):,}")
	st.write(f"• Columns: {len(filtered_df.columns)}")

	if numeric_cols:
	st.write(f"• Numeric columns: {len(numeric_cols)}")
	for col in numeric_cols[:3]:
	mean_val = filtered_df[col].mean()
	st.write(f" - {col}: avg = {mean_val:.2f}")

	with col2:
	st.markdown("Missing Data")
	missing_data = filtered_df.isnull().sum()
	if missing_data.sum() > 0:
	for col, missing in missing_data.items():
	if missing > 0:
	pct = (missing / len(filtered_df)) * 100
	st.write(f"• {col}: {missing} ({pct:.1f}%)")
	else:
	st.write("✅ No missing data found")

	if __name__ == "__main__":
	main()