Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import numpy as np | |
| import io | |
| import base64 | |
| import re | |
| from datetime import datetime | |
| # Page configuration | |
| st.set_page_config( | |
| page_title="Data Analysis Dashboard", | |
| page_icon="📊", | |
| layout="wide", | |
| initial_sidebar_state="expanded" | |
| ) | |
| # Custom CSS for better styling | |
| st.markdown(""" | |
| <style> | |
| .main-header { | |
| font-size: 2.5rem; | |
| font-weight: bold; | |
| color: #1f77b4; | |
| text-align: center; | |
| margin-bottom: 2rem; | |
| } | |
| .metric-container { | |
| background-color: #f0f2f6; | |
| padding: 1rem; | |
| border-radius: 0.5rem; | |
| margin: 0.5rem 0; | |
| } | |
| .stSelectbox > div > div { | |
| background-color: white; | |
| } | |
| .upload-section { | |
| border: 2px dashed #cccccc; | |
| border-radius: 10px; | |
| padding: 2rem; | |
| text-align: center; | |
| margin: 1rem 0; | |
| } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| def convert_brazilian_number(value): | |
| """Convert Brazilian number format (xx.xxx.xxx,xx) to float""" | |
| if pd.isna(value) or value == '': | |
| return np.nan | |
| # Convert to string if not already | |
| str_value = str(value).strip() | |
| # Check if it's already a number | |
| try: | |
| return float(str_value) | |
| except ValueError: | |
| pass | |
| # Brazilian number pattern: can have dots as thousand separators and comma as decimal | |
| # Examples: "1.234.567,89", "1.234,56", "1234,56", "1234" | |
| brazilian_pattern = r'^-?\d{1,3}(?:\.\d{3})*(?:,\d+)?$' | |
| if re.match(brazilian_pattern, str_value): | |
| # Remove thousand separators (dots) and replace decimal comma with dot | |
| converted = str_value.replace('.', '').replace(',', '.') | |
| try: | |
| return float(converted) | |
| except ValueError: | |
| return np.nan | |
| return np.nan | |
| def detect_and_convert_brazilian_numbers(df): | |
| """Detect and convert Brazilian number format columns to numeric""" | |
| converted_columns = [] | |
| df_converted = df.copy() | |
| for col in df.columns: | |
| if df[col].dtype == 'object': # Only check string columns | |
| # Sample some non-null values to check if they look like Brazilian numbers | |
| sample_values = df[col].dropna().astype(str).head(10) | |
| if len(sample_values) > 0: | |
| # Check if most values match Brazilian number pattern | |
| brazilian_count = 0 | |
| total_count = 0 | |
| for value in sample_values: | |
| value = str(value).strip() | |
| if value and value != 'nan': | |
| total_count += 1 | |
| # Brazilian number pattern | |
| if re.match(r'^-?\d{1,3}(?:\.\d{3})*(?:,\d+)?$', value) or re.match(r'^-?\d+,\d+$', value): | |
| brazilian_count += 1 | |
| # If more than 70% of values look like Brazilian numbers, convert the column | |
| if total_count > 0 and (brazilian_count / total_count) > 0.7: | |
| converted_series = df[col].apply(convert_brazilian_number) | |
| # Only convert if we successfully converted most values | |
| non_null_original = df[col].notna().sum() | |
| non_null_converted = converted_series.notna().sum() | |
| if non_null_converted >= (non_null_original * 0.8): # At least 80% conversion success | |
| df_converted[col] = converted_series | |
| converted_columns.append(col) | |
| return df_converted, converted_columns | |
| def load_sample_data(): | |
| """Generate sample data for demonstration""" | |
| np.random.seed(42) | |
| n_samples = 1000 | |
| data = { | |
| 'Date': pd.date_range('2023-01-01', periods=n_samples, freq='D'), | |
| 'Sales': np.random.normal(1000, 200, n_samples), | |
| 'Profit': np.random.normal(150, 50, n_samples), | |
| 'Category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home'], n_samples), | |
| 'Region': np.random.choice(['North', 'South', 'East', 'West'], n_samples), | |
| 'Customer_Age': np.random.randint(18, 80, n_samples), | |
| 'Rating': np.random.uniform(1, 5, n_samples) | |
| } | |
| df = pd.DataFrame(data) | |
| df['Sales'] = np.where(df['Sales'] < 0, abs(df['Sales']), df['Sales']) | |
| df['Profit'] = np.where(df['Category'] == 'Electronics', df['Profit'] * 1.5, df['Profit']) | |
| # Add some Brazilian formatted numbers for demonstration | |
| df['Vendas_BR'] = df['Sales'].apply(lambda x: f"{x:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '.')) | |
| df['Lucro_BR'] = df['Profit'].apply(lambda x: f"{x:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '.')) | |
| return df | |
| def get_numeric_columns(df): | |
| """Get numeric columns from dataframe""" | |
| return df.select_dtypes(include=[np.number]).columns.tolist() | |
| def get_categorical_columns(df): | |
| """Get categorical columns from dataframe""" | |
| return df.select_dtypes(include=['object', 'category']).columns.tolist() | |
| def create_download_link(df, filename="filtered_data.csv"): | |
| """Create download link for dataframe""" | |
| csv = df.to_csv(index=False) | |
| b64 = base64.b64encode(csv.encode()).decode() | |
| href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV File</a>' | |
| return href | |
| def main(): | |
| # Header | |
| st.markdown('<h1 class="main-header">📊 Data Analysis Dashboard</h1>', unsafe_allow_html=True) | |
| # Sidebar | |
| st.sidebar.title("🔧 Controls") | |
| st.sidebar.markdown("---") | |
| # File upload section | |
| st.sidebar.subheader("📁 Data Upload") | |
| uploaded_file = st.sidebar.file_uploader( | |
| "Choose a CSV file", | |
| type="csv", | |
| help="Upload a CSV file to analyze your data" | |
| ) | |
| use_sample = st.sidebar.checkbox( | |
| "Use Sample Data", | |
| value=True if uploaded_file is None else False, | |
| help="Check this to use built-in sample data for demonstration" | |
| ) | |
| # Brazilian number conversion option | |
| convert_brazilian = st.sidebar.checkbox( | |
| "🇧🇷 Auto-convert Brazilian Numbers", | |
| value=True, | |
| help="Automatically detect and convert Brazilian number format (xx.xxx.xxx,xx) to numeric" | |
| ) | |
| # Load data | |
| try: | |
| if uploaded_file is not None: | |
| df = pd.read_csv(uploaded_file) | |
| st.sidebar.success(f"✅ File uploaded successfully! ({len(df)} rows)") | |
| elif use_sample: | |
| df = load_sample_data() | |
| st.sidebar.info("📋 Using sample data") | |
| else: | |
| st.warning("Please upload a CSV file or use sample data to get started.") | |
| st.markdown(""" | |
| ### 🚀 Welcome to the Data Analysis Dashboard! | |
| This app helps you analyze and visualize your data with: | |
| - **Interactive charts** (bar, line, scatter, histogram) | |
| - **Dynamic filtering** and data exploration | |
| - **Statistical summaries** and insights | |
| - **Export capabilities** for data and visualizations | |
| - **🇧🇷 Brazilian number format support** (xx.xxx.xxx,xx) | |
| **To get started:** | |
| 1. Upload a CSV file using the sidebar, or | |
| 2. Check "Use Sample Data" to explore with demo data | |
| """) | |
| return | |
| # Apply Brazilian number conversion if enabled | |
| if convert_brazilian: | |
| df_original = df.copy() | |
| df, converted_cols = detect_and_convert_brazilian_numbers(df) | |
| if converted_cols: | |
| st.sidebar.success(f"🇧🇷 Converted {len(converted_cols)} columns from Brazilian format: {', '.join(converted_cols)}") | |
| except Exception as e: | |
| st.error(f"❌ Error loading file: {str(e)}") | |
| st.info("Please make sure your file is a valid CSV format.") | |
| return | |
| # Data preview section | |
| st.subheader("📋 Data Preview") | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.metric("Total Rows", len(df)) | |
| with col2: | |
| st.metric("Total Columns", len(df.columns)) | |
| with col3: | |
| st.metric("Numeric Columns", len(get_numeric_columns(df))) | |
| with col4: | |
| st.metric("Text Columns", len(get_categorical_columns(df))) | |
| # Show data preview | |
| with st.expander("🔍 View Raw Data", expanded=False): | |
| st.dataframe(df.head(100), use_container_width=True) | |
| # Data summary | |
| with st.expander("📊 Statistical Summary", expanded=False): | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.subheader("Numeric Columns") | |
| numeric_cols = get_numeric_columns(df) | |
| if numeric_cols: | |
| st.dataframe(df[numeric_cols].describe()) | |
| else: | |
| st.info("No numeric columns found") | |
| with col2: | |
| st.subheader("Categorical Columns") | |
| cat_cols = get_categorical_columns(df) | |
| if cat_cols: | |
| for col in cat_cols[:5]: # Show first 5 categorical columns | |
| st.write(f"**{col}:** {df[col].nunique()} unique values") | |
| if df[col].nunique() <= 10: | |
| st.write(df[col].value_counts().head()) | |
| else: | |
| st.info("No categorical columns found") | |
| # Show conversion info if Brazilian conversion was applied | |
| if convert_brazilian and 'converted_cols' in locals() and converted_cols: | |
| with st.expander("🇧🇷 Brazilian Number Conversion Details", expanded=False): | |
| st.write("**Converted Columns:**") | |
| for col in converted_cols: | |
| original_sample = df_original[col].dropna().head(3).tolist() | |
| converted_sample = df[col].dropna().head(3).tolist() | |
| st.write(f"**{col}:**") | |
| st.write(f" - Original: {original_sample}") | |
| st.write(f" - Converted: {converted_sample}") | |
| # Filtering section | |
| st.sidebar.markdown("---") | |
| st.sidebar.subheader("🔍 Data Filters") | |
| # Create a copy for filtering | |
| filtered_df = df.copy() | |
| # Numeric filters | |
| numeric_cols = get_numeric_columns(df) | |
| for col in numeric_cols: | |
| if df[col].dtype in ['int64', 'float64']: | |
| min_val = float(df[col].min()) | |
| max_val = float(df[col].max()) | |
| if min_val != max_val: | |
| selected_range = st.sidebar.slider( | |
| f"{col} Range", | |
| min_value=min_val, | |
| max_value=max_val, | |
| value=(min_val, max_val), | |
| help=f"Filter data by {col} values" | |
| ) | |
| filtered_df = filtered_df[ | |
| (filtered_df[col] >= selected_range[0]) & | |
| (filtered_df[col] <= selected_range[1]) | |
| ] | |
| # Categorical filters | |
| cat_cols = get_categorical_columns(df) | |
| for col in cat_cols: | |
| unique_values = df[col].unique().tolist() | |
| if len(unique_values) <= 50: # Only show filter for columns with reasonable number of unique values | |
| selected_values = st.sidebar.multiselect( | |
| f"Select {col}", | |
| options=unique_values, | |
| default=unique_values, | |
| help=f"Filter data by {col} categories" | |
| ) | |
| if selected_values: | |
| filtered_df = filtered_df[filtered_df[col].isin(selected_values)] | |
| # Show filtered data info | |
| if len(filtered_df) != len(df): | |
| st.sidebar.info(f"Filtered: {len(filtered_df)} of {len(df)} rows") | |
| # Visualization section | |
| st.markdown("---") | |
| st.subheader("📈 Data Visualization") | |
| # Chart type selection | |
| chart_type = st.selectbox( | |
| "Select Chart Type", | |
| ["Bar Chart", "Line Chart", "Scatter Plot", "Histogram", "Box Plot"], | |
| help="Choose the type of visualization" | |
| ) | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| if chart_type in ["Bar Chart", "Line Chart", "Scatter Plot", "Box Plot"]: | |
| x_column = st.selectbox( | |
| "X-axis Column", | |
| options=df.columns.tolist(), | |
| help="Select column for X-axis" | |
| ) | |
| else: | |
| x_column = st.selectbox( | |
| "Column to Analyze", | |
| options=numeric_cols, | |
| help="Select numeric column for histogram" | |
| ) | |
| with col2: | |
| if chart_type in ["Bar Chart", "Line Chart", "Scatter Plot", "Box Plot"]: | |
| y_column = st.selectbox( | |
| "Y-axis Column", | |
| options=numeric_cols, | |
| help="Select numeric column for Y-axis" | |
| ) | |
| else: | |
| y_column = None | |
| with col3: | |
| if chart_type in ["Bar Chart", "Scatter Plot", "Box Plot"]: | |
| color_column = st.selectbox( | |
| "Color/Group By (Optional)", | |
| options=[None] + cat_cols, | |
| help="Select column to group/color data" | |
| ) | |
| else: | |
| color_column = None | |
| # Create visualization | |
| if chart_type == "Bar Chart" and x_column and y_column: | |
| if x_column in cat_cols: | |
| # Aggregate data for categorical x-axis | |
| agg_df = filtered_df.groupby(x_column)[y_column].mean().reset_index() | |
| fig = px.bar( | |
| agg_df, | |
| x=x_column, | |
| y=y_column, | |
| title=f"Average {y_column} by {x_column}", | |
| color=color_column if color_column and color_column in agg_df.columns else None | |
| ) | |
| else: | |
| fig = px.bar( | |
| filtered_df, | |
| x=x_column, | |
| y=y_column, | |
| title=f"{y_column} vs {x_column}", | |
| color=color_column | |
| ) | |
| elif chart_type == "Line Chart" and x_column and y_column: | |
| fig = px.line( | |
| filtered_df, | |
| x=x_column, | |
| y=y_column, | |
| title=f"{y_column} vs {x_column}", | |
| color=color_column | |
| ) | |
| elif chart_type == "Scatter Plot" and x_column and y_column: | |
| fig = px.scatter( | |
| filtered_df, | |
| x=x_column, | |
| y=y_column, | |
| title=f"{y_column} vs {x_column}", | |
| color=color_column, | |
| size=y_column if y_column in numeric_cols else None | |
| ) | |
| elif chart_type == "Histogram" and x_column: | |
| fig = px.histogram( | |
| filtered_df, | |
| x=x_column, | |
| title=f"Distribution of {x_column}", | |
| nbins=30 | |
| ) | |
| elif chart_type == "Box Plot" and x_column and y_column: | |
| fig = px.box( | |
| filtered_df, | |
| x=x_column, | |
| y=y_column, | |
| title=f"{y_column} Distribution by {x_column}", | |
| color=color_column | |
| ) | |
| else: | |
| st.warning("Please select appropriate columns for the chosen chart type.") | |
| return | |
| # Update layout for better appearance | |
| fig.update_layout( | |
| height=500, | |
| showlegend=True, | |
| title_x=0.5, | |
| font=dict(size=12) | |
| ) | |
| # Display chart | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Download section | |
| st.markdown("---") | |
| st.subheader("💾 Download Options") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.markdown("**Download Filtered Data**") | |
| if st.button("Generate CSV Download Link"): | |
| download_link = create_download_link(filtered_df, f"filtered_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv") | |
| st.markdown(download_link, unsafe_allow_html=True) | |
| with col2: | |
| st.markdown("**Download Chart**") | |
| if st.button("Download Chart as HTML"): | |
| html_string = fig.to_html(include_plotlyjs='cdn') | |
| st.download_button( | |
| label="Download HTML", | |
| data=html_string, | |
| file_name=f"chart_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html", | |
| mime="text/html" | |
| ) | |
| # Additional insights | |
| if len(filtered_df) > 0: | |
| st.markdown("---") | |
| st.subheader("🔍 Quick Insights") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.markdown("**Data Overview**") | |
| st.write(f"• Total records: {len(filtered_df):,}") | |
| st.write(f"• Columns: {len(filtered_df.columns)}") | |
| if numeric_cols: | |
| st.write(f"• Numeric columns: {len(numeric_cols)}") | |
| for col in numeric_cols[:3]: | |
| mean_val = filtered_df[col].mean() | |
| st.write(f" - {col}: avg = {mean_val:.2f}") | |
| with col2: | |
| st.markdown("**Missing Data**") | |
| missing_data = filtered_df.isnull().sum() | |
| if missing_data.sum() > 0: | |
| for col, missing in missing_data.items(): | |
| if missing > 0: | |
| pct = (missing / len(filtered_df)) * 100 | |
| st.write(f"• {col}: {missing} ({pct:.1f}%)") | |
| else: | |
| st.write("✅ No missing data found") | |
| if __name__ == "__main__": | |
| main() |