"""Gradio dashboard for exploratory analysis of an uploaded CSV file.

The app has three tabs: CSV upload with a preview, a Sweetviz profiling
report, and a matplotlib-based visual report with summary statistics.
"""
import atexit
import base64
import html
import io
import os
import shutil
import tempfile
import traceback
import warnings
from pathlib import Path

import category_encoders as ce
import gradio as gr
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import sweetviz as sv
import umap
from autoviz.AutoViz_Class import AutoViz_Class
from sklearn.preprocessing import StandardScaler

# Non-interactive backend: figures are only rendered to PNG bytes.
matplotlib.use('Agg')
warnings.filterwarnings('ignore')


class DataAnalyzer:
    """Build HTML analysis reports for a pandas DataFrame.

    Attributes:
        temp_dir: Scratch directory that briefly holds the Sweetviz HTML file.
        df: The DataFrame most recently passed to ``generate_sweetviz_report``.
        AV: An ``AutoViz_Class`` instance (not referenced by the methods below).
        plots_memory: Maps a plot name to its PNG image as a ``data:`` URI.
    """

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()
        # mkdtemp() never cleans up after itself; remove the directory on exit.
        atexit.register(shutil.rmtree, self.temp_dir, ignore_errors=True)
        self.df = None
        self.AV = AutoViz_Class()
        self.plots_memory = {}  # Store plots in memory

    def save_plot_to_memory(self, fig, plot_name):
        """Render ``fig`` to PNG and store it in ``plots_memory`` as a data URI.

        Args:
            fig: The matplotlib figure to render. It is always closed, even
                when rendering fails, so figures do not accumulate.
            plot_name: Key under which the encoded image is stored.
        """
        buf = io.BytesIO()
        try:
            fig.savefig(buf, format='png', bbox_inches='tight')
        finally:
            plt.close(fig)
        img_str = base64.b64encode(buf.getvalue()).decode()
        self.plots_memory[plot_name] = f'data:image/png;base64,{img_str}'

    def generate_basic_plots(self, df):
        """Add histogram, box, bar and correlation plots of ``df`` to memory.

        Plots are stored through ``save_plot_to_memory`` under the keys
        ``dist_<col>``, ``box_<col>``, ``cat_<col>`` and ``correlation_matrix``.

        Args:
            df: The DataFrame to plot.
        """
        # Numeric columns: one histogram and one box plot each.
        numeric_cols = df.select_dtypes(include=['number']).columns
        for col in numeric_cols:
            fig, ax = plt.subplots(figsize=(10, 6))
            df[col].hist(bins=30, ax=ax)
            ax.set_title(f'Distribution of {col}')
            self.save_plot_to_memory(fig, f'dist_{col}')

            fig, ax = plt.subplots(figsize=(10, 6))
            df.boxplot(column=col, ax=ax)
            ax.set_title(f'Box Plot of {col}')
            self.save_plot_to_memory(fig, f'box_{col}')

        # Categorical columns: bar chart of value counts.
        categorical_cols = df.select_dtypes(include=['category', 'object']).columns
        for col in categorical_cols:
            # Skip all-missing columns (nothing to draw) and columns with too
            # many categories to read on one axis.
            if 0 < df[col].nunique() < 20:
                fig, ax = plt.subplots(figsize=(12, 6))
                df[col].value_counts().plot(kind='bar', ax=ax, rot=45)
                ax.set_title(f'Distribution of {col}')
                self.save_plot_to_memory(fig, f'cat_{col}')

        # Correlation matrix needs at least two numeric columns.
        if len(numeric_cols) > 1:
            fig, ax = plt.subplots(figsize=(10, 8))
            correlation_matrix = df[numeric_cols].corr()
            im = ax.imshow(correlation_matrix)
            tick_positions = range(len(numeric_cols))
            ax.set_xticks(tick_positions)
            ax.set_yticks(tick_positions)
            ax.set_xticklabels(numeric_cols, rotation=45)
            ax.set_yticklabels(numeric_cols)
            fig.colorbar(im, ax=ax)
            ax.set_title('Correlation Matrix')
            self.save_plot_to_memory(fig, 'correlation_matrix')

    def generate_sweetviz_report(self, df):
        """Return the Sweetviz HTML report for ``df`` as a string.

        The report is written to a file in ``temp_dir``, read back, and the
        file is removed again whether or not reading succeeded.

        Args:
            df: The DataFrame to profile, or ``None``.

        Returns:
            The report HTML, or a prompt to upload data when ``df`` is ``None``.
        """
        if df is None:
            return "Please upload a dataset first"

        self.df = df
        report = sv.analyze(df)
        report_path = os.path.join(self.temp_dir, "report.html")
        try:
            report.show_html(report_path, open_browser=False)
            with open(report_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
        finally:
            Path(report_path).unlink(missing_ok=True)

        html_with_table = f"""
{html_content}
"""
        return html_with_table

    def generate_autoviz_report(self, df):
        """Return an HTML report with summary statistics and embedded plots.

        A ``value`` column, if present, has ``$`` and ``,`` removed and is
        converted to numeric. Frames with more than 5000 rows are sampled
        down to 5000 rows (fixed seed) before statistics and plots are made.

        Args:
            df: The DataFrame to analyse, or ``None``. It is copied, not
                modified.

        Returns:
            The report HTML. When ``df`` is ``None`` a prompt to upload data
            is returned; when analysis fails an HTML error block with the
            escaped message and stack trace is returned instead of raising.
        """
        if df is None:
            return "Please upload a dataset first"

        try:
            df = df.copy()

            # Strip currency formatting so 'value' can be treated as a number.
            if 'value' in df.columns:
                df['value'] = pd.to_numeric(
                    df['value'].replace(r'[\$,]', '', regex=True),
                    errors='coerce',
                )

            # Remember the real size before sampling so the overview is honest.
            total_rows = len(df)
            if total_rows > 5000:
                df = df.sample(n=5000, random_state=42)

            # Drop plots left over from a previous dataset before adding new ones.
            self.plots_memory.clear()
            self.generate_basic_plots(df)

            numeric_cols = df.select_dtypes(include=['number']).columns
            categorical_cols = df.select_dtypes(include=['category', 'object']).columns
            numeric_stats = (
                df[numeric_cols].describe().round(2)
                if len(numeric_cols) > 0 else pd.DataFrame()
            )
            categorical_stats = (
                df[categorical_cols].describe()
                if len(categorical_cols) > 0 else pd.DataFrame()
            )

            parts = [
                '<div class="analysis-report">',
                '<h1>Data Analysis Report</h1>',
                '<h2>Dataset Overview</h2>',
                f'<p>Total Rows: {total_rows}</p>',
                f'<p>Total Columns: {len(df.columns)}</p>',
            ]
            if len(df) < total_rows:
                parts.append(
                    f'<p>Statistics and plots are based on a random sample '
                    f'of {len(df)} rows.</p>'
                )
            parts += [
                '<h2>Numeric Variables Summary</h2>',
                numeric_stats.to_html(classes='table table-striped'),
                '<h2>Categorical Variables Summary</h2>',
                categorical_stats.to_html(classes='table table-striped'),
            ]

            # Plot names contain column names from the uploaded file, so they
            # are escaped before being placed in markup.
            for plot_name, plot_data in self.plots_memory.items():
                title = html.escape(plot_name.replace('_', ' ').title())
                alt_text = html.escape(plot_name)
                parts.append(f'<h3>{title}</h3>')
                parts.append(
                    f'<img src="{plot_data}" alt="{alt_text}" '
                    f'style="max-width: 100%;">'
                )

            parts.append('</div>')
            return "\n".join(parts)

        except Exception as e:
            # UI boundary: show the failure in the page instead of raising.
            return "\n".join([
                '<div class="analysis-error">',
                '<h3>Error in Analysis</h3>',
                f'<p>Error details: {html.escape(str(e))}</p>',
                '<p>Stack trace:</p>',
                f'<pre>{html.escape(traceback.format_exc())}</pre>',
                '</div>',
            ])


def create_interface():
    """Build the three-tab Gradio Blocks app and return it (not launched).

    Returns:
        The ``gr.Blocks`` instance with upload, Sweetviz and visual-analysis
        tabs wired to a single ``DataAnalyzer``.
    """
    analyzer = DataAnalyzer()

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # Data Analysis Dashboard
        This dashboard provides comprehensive data analysis and visualization capabilities.
        """)

        # Store the dataframe in a state variable
        current_df = gr.State(None)

        with gr.Tabs():
            # First Tab: Data Upload & Preview
            with gr.TabItem("Data Upload & Preview"):
                with gr.Row():
                    with gr.Column(scale=2):
                        file_input = gr.File(
                            label="Upload CSV File",
                            file_types=[".csv"],
                            file_count="single",
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("""
                        ### Upload Instructions
                        1. Select a CSV file
                        2. File will be automatically loaded
                        3. Preview will appear below
                        """)

                with gr.Row():
                    data_info = gr.Markdown("No data uploaded yet")

                with gr.Row():
                    data_preview = gr.Dataframe(
                        label="Data Preview",
                        interactive=False,
                        wrap=True,
                    )

                def load_data(file):
                    """Read the uploaded CSV; return (info markdown, preview, frame)."""
                    if file is None:
                        return "No data uploaded yet", None, None
                    try:
                        # Accept either a file wrapper with .name or a plain path.
                        df = pd.read_csv(getattr(file, "name", file))
                        memory_kb = df.memory_usage(deep=True).sum() / 1024
                        info_text = "\n".join([
                            "### Dataset Information",
                            f"- Rows: {len(df)}",
                            f"- Columns: {len(df.columns)}",
                            f"- Memory Usage: {memory_kb:.2f} KB",
                            f"- Column Types: {dict(df.dtypes.value_counts())}",
                        ])
                        return info_text, df.head(10), df
                    except Exception as e:
                        return f"Error loading file: {str(e)}", None, None

                file_input.change(
                    fn=load_data,
                    inputs=[file_input],
                    outputs=[data_info, data_preview, current_df],
                )

            # Second Tab: Sweetviz Analysis
            with gr.TabItem("Sweetviz Analysis"):
                with gr.Row():
                    with gr.Column(scale=2):
                        sweetviz_button = gr.Button(
                            "Generate Sweetviz Report",
                            variant="primary",
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("""
                        ### Sweetviz Analysis Features
                        - Comprehensive data profiling
                        - Statistical analysis
                        - Feature correlations
                        - Missing value analysis
                        """)

                with gr.Row():
                    sweetviz_output = gr.HTML(
                        label="Sweetviz Report",
                        value="Click the button above to generate the report",
                    )

                def generate_sweetviz(df):
                    """Return the Sweetviz report HTML, or an error message."""
                    if df is None:
                        return "Please upload a dataset first"
                    try:
                        return analyzer.generate_sweetviz_report(df)
                    except Exception as e:
                        # The message is rendered as HTML, so escape it.
                        return f"Error generating Sweetviz report: {html.escape(str(e))}"

                sweetviz_button.click(
                    fn=generate_sweetviz,
                    inputs=[current_df],
                    outputs=[sweetviz_output],
                )

            # Third Tab: Visual Analysis
            with gr.TabItem("Visual Analysis"):
                with gr.Row():
                    with gr.Column(scale=2):
                        viz_button = gr.Button(
                            "Generate Visualizations",
                            variant="primary",
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("""
                        ### Visualization Features
                        - Distribution plots
                        - Correlation analysis
                        - Categorical variable analysis
                        - Statistical summaries
                        """)

                with gr.Row():
                    viz_output = gr.HTML(
                        label="Visualization Report",
                        value="Click the button above to generate visualizations",
                    )

                def generate_viz(df):
                    """Return the visual report HTML, or an error message."""
                    if df is None:
                        return "Please upload a dataset first"
                    try:
                        return analyzer.generate_autoviz_report(df)
                    except Exception as e:
                        # The message is rendered as HTML, so escape it.
                        return f"Error generating visualizations: {html.escape(str(e))}"

                viz_button.click(
                    fn=generate_viz,
                    inputs=[current_df],
                    outputs=[viz_output],
                )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        share=False,  # Set to True if you want to create a public link
    )