Spaces:

reab5555
/

AI-Data-Cleaner

Running

App Files Files Community

reab5555 committed on Sep 12, 2024

Commit

deab6bd

verified ·

1 Parent(s): 536f053

Update report.py

Browse files

Files changed (1) hide show

report.py +205 -207

report.py CHANGED Viewed

@@ -1,208 +1,206 @@
-import os
-import numpy as np
-import pandas as pd
-import seaborn as sns
-import matplotlib.pyplot as plt
-from datetime import datetime
-REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
-os.makedirs(REPORT_DIR, exist_ok=True)
-def save_plot(fig, filename):
-    fig.savefig(os.path.join(REPORT_DIR, filename), dpi=400, bbox_inches='tight')
-    plt.close(fig)
-def plot_heatmap(df, title):
-    plt.figure(figsize=(12, 8))
-    sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
-    plt.title(title)
-    plt.tight_layout()
-    save_plot(plt.gcf(), f'{title.lower().replace(" ", "_")}.png')
-def plot_valid_data_percentage(original_df, cleaned_df):
-    original_valid = (original_df.notna().sum() / len(original_df)) * 100
-    cleaned_valid = (cleaned_df.notna().sum() / len(cleaned_df)) * 100
-    # Create a DataFrame with both original and cleaned percentages
-    combined_df = pd.DataFrame({
-        'Original': original_valid,
-        'Cleaned': cleaned_valid
-    }).fillna(0)  # Fill NaN with 0 for columns that were removed
-    plt.figure(figsize=(15, 8))
-    combined_df.plot(kind='bar', width=0.8, alpha=0.8)
-    plt.xlabel('Columns')
-    plt.ylabel('Percentage of Valid Data')
-    plt.title('Percentage of Valid Data Before and After Cleaning')
-    plt.xticks(rotation=90)
-    plt.legend(['Before Cleaning', 'After Cleaning'])
-    # Add percentage labels on the bars
-    for i, (index, row) in enumerate(combined_df.iterrows()):
-        plt.text(i, row['Original'], f'{row["Original"]:.1f}%', ha='center', va='bottom')
-        if row['Cleaned'] > 0:  # Only add label if column exists in cleaned data
-            plt.text(i, row['Cleaned'], f'{row["Cleaned"]:.1f}%', ha='center', va='bottom')
-    plt.tight_layout()
-    plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
-    plt.close()
-def plot_column_schemas(df):
-    schemas = df.dtypes.astype(str).value_counts()
-    fig, ax = plt.subplots(figsize=(10, 6))
-    sns.barplot(x=schemas.index, y=schemas.values, ax=ax)
-    ax.set_title('Column Data Types')
-    ax.set_xlabel('Data Type')
-    ax.set_ylabel('Count')
-    save_plot(fig, 'column_schemas.png')
-def plot_nonconforming_cells(nonconforming_cells):
-    # Ensure that nonconforming_cells is a dictionary
-    if isinstance(nonconforming_cells, dict):
-        # Proceed with plotting if it's a dictionary
-        fig, ax = plt.subplots(figsize=(12, 6))
-        sns.barplot(x=list(nonconforming_cells.keys()), y=list(nonconforming_cells.values()), ax=ax)
-        ax.set_title('Nonconforming Cells by Column')
-        ax.set_xlabel('Columns')
-        ax.set_ylabel('Number of Nonconforming Cells')
-        plt.xticks(rotation=90)
-        save_plot(fig, 'nonconforming_cells.png')
-    else:
-        print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.")
-def plot_column_distributions(original_df, cleaned_df):
-    numeric_columns = original_df.select_dtypes(include=[np.number]).columns
-    num_columns = len(numeric_columns)
-    if num_columns == 0:
-        print("No numeric columns found for distribution plots.")
-        return
-    # Create subplots for distributions
-    fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(18, 5 * ((num_columns + 2) // 3)))
-    axes = axes.flatten() if num_columns > 1 else [axes]
-    for i, column in enumerate(numeric_columns):
-        if column in cleaned_df.columns:
-            sns.histplot(original_df[column].dropna(), ax=axes[i], kde=True, color='blue', label='Before Cleaning', alpha=0.5)
-            sns.histplot(cleaned_df[column].dropna(), ax=axes[i], kde=True, color='orange', label='After Cleaning', alpha=0.5)
-            axes[i].set_title(f'{column} - Distribution Before & After Cleaning')
-            axes[i].legend()
-    # Remove any unused subplots
-    for j in range(i + 1, len(axes)):
-        fig.delaxes(axes[j])
-    plt.tight_layout()
-    save_plot(fig, 'distributions_before_after_cleaning.png')
-def plot_boxplot_with_outliers(df):
-    print("Plotting boxplots with outliers...")
-    numeric_columns = df.select_dtypes(include=[np.number]).columns
-    num_columns = len(numeric_columns)
-    if num_columns == 0:
-        print("No numeric columns found for boxplot.")
-        return
-    # Create subplots based on the number of numeric columns
-    fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(15, 5 * ((num_columns + 2) // 3)))
-    axes = axes.flatten() if num_columns > 1 else [axes]
-    for i, column in enumerate(numeric_columns):
-        sns.boxplot(x=df[column], ax=axes[i])
-        axes[i].set_title(f'Boxplot of {column} with Outliers')
-    # Remove any unused subplots
-    for j in range(i + 1, len(axes)):
-        fig.delaxes(axes[j])
-    plt.tight_layout()
-    save_plot(fig, 'boxplots_with_outliers.png')
-def plot_correlation_heatmap(df):
-    # Select only numeric, float, and integer columns
-    numeric_df = df.select_dtypes(include=[np.number])
-    # Compute the correlation matrix
-    correlation_matrix = numeric_df.corr()
-    # Plot the heatmap
-    fig, ax = plt.subplots(figsize=(15, 10))
-    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax, cbar_kws={'label': 'Correlation'})
-    ax.set_title('Correlation Heatmap')
-    save_plot(fig, 'correlation_heatmap.png')
-def plot_process_times(process_times):
-    # Convert seconds to minutes
-    process_times_minutes = {k: v / 60 for k, v in process_times.items()}
-    # Separate main processes and column cleaning processes
-    main_processes = {k: v for k, v in process_times_minutes.items() if not k.startswith("Clean column:")}
-    column_processes = {k: v for k, v in process_times_minutes.items() if k.startswith("Clean column:")}
-    # Create the plot
-    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))
-    # Plot main processes
-    bars1 = ax1.bar(main_processes.keys(), main_processes.values())
-    ax1.set_title('Main Process Times')
-    ax1.set_ylabel('Time (minutes)')
-    ax1.tick_params(axis='x', rotation=45)
-    # Plot column cleaning processes
-    bars2 = ax2.bar(column_processes.keys(), column_processes.values())
-    ax2.set_title('Column Cleaning Times')
-    ax2.set_ylabel('Time (minutes)')
-    ax2.tick_params(axis='x', rotation=90)
-    # Add value labels on top of each bar
-    for ax, bars in zip([ax1, ax2], [bars1, bars2]):
-        for bar in bars:
-            height = bar.get_height()
-            ax.text(bar.get_x() + bar.get_width() / 2., height,
-                    f'{height:.2f}', ha='center', va='bottom')
-    # Add total time to the plot
-    total_time = sum(process_times_minutes.values())
-    fig.suptitle(f'Process Times (Total: {total_time:.2f} minutes)', fontsize=16)
-    plt.tight_layout()
-    save_plot(fig, 'process_times.png')
-def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows):
-    os.makedirs(REPORT_DIR, exist_ok=True)
-    sns.set_style("whitegrid")
-    plt.rcParams['figure.dpi'] = 400
-    print("Plotting valid data percentages...")
-    plot_valid_data_percentage(original_df, cleaned_df)
-    print("Plotting column schemas...")
-    plot_column_schemas(cleaned_df)
-    print("Plotting nonconforming cells before cleaning...")
-    plot_nonconforming_cells(nonconforming_cells_before)
-    print("Plotting column distributions...")
-    plot_column_distributions(original_df, cleaned_df)
-    print("Plotting process times...")
-    plot_process_times(process_times)
-    print("Plotting heatmaps...")
-    plot_heatmap(original_df, "Missing Values Before Cleaning")
-    print("Plotting correlation heatmap...")
-    plot_correlation_heatmap(cleaned_df)
     print(f"All visualization reports saved in directory: {REPORT_DIR}")

+import os
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from datetime import datetime
+REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"  # Timestamped output directory name, computed once when the module is imported.
+os.makedirs(REPORT_DIR, exist_ok=True)  # Import-time side effect: creates the directory (relative path, so under the current working directory).
+def save_plot(fig, filename):
+    """Save *fig* inside REPORT_DIR as *filename*, then close the figure.
+
+    The image is written at 400 dpi with a tight bounding box.
+    """
+    destination = os.path.join(REPORT_DIR, filename)
+    fig.savefig(destination, dpi=400, bbox_inches='tight')
+    plt.close(fig)
+def plot_heatmap(df, title):
+    """Draw the null-mask of *df* as a heatmap titled *title* and save it.
+
+    The file name is *title* lower-cased, with spaces replaced by
+    underscores, plus a ``.png`` suffix.
+    """
+    fig = plt.figure(figsize=(12, 8))
+    missing_mask = df.isnull()
+    # With no explicit ax, seaborn draws on the current axes of the new figure.
+    sns.heatmap(missing_mask, cbar=False, cmap='viridis')
+    plt.title(title)
+    plt.tight_layout()
+    image_name = title.lower().replace(" ", "_") + '.png'
+    save_plot(fig, image_name)
+def plot_valid_data_percentage(original_df, cleaned_df):
+    """Plot the per-column percentage of non-null cells before and after cleaning.
+
+    Bars are aligned by column name, not by position, so the chart stays
+    correct when the two frames do not have the same columns (for example
+    when cleaning removed some). A column missing from one frame gets a
+    zero-height bar and no label on that side.
+
+    Args:
+        original_df: DataFrame as it was before cleaning.
+        cleaned_df: DataFrame after cleaning.
+
+    Saves ``valid_data_percentage.png`` in REPORT_DIR.
+    """
+    original_valid = (original_df.notna().sum() / len(original_df)) * 100
+    cleaned_valid = (cleaned_df.notna().sum() / len(cleaned_df)) * 100
+    # Positional plotting only works when both frames have identical columns;
+    # otherwise realign on the union of names (original order first).
+    if not original_valid.index.equals(cleaned_valid.index):
+        columns = original_valid.index.union(cleaned_valid.index, sort=False)
+        original_valid = original_valid.reindex(columns)
+        cleaned_valid = cleaned_valid.reindex(columns)
+    plt.figure(figsize=(15, 8))
+    x = range(len(original_valid))
+    width = 0.35
+    # NaN marks a column absent from that frame; draw it as a zero-height bar.
+    plt.bar(x, original_valid.fillna(0), width, label='Before Cleaning', alpha=0.8)
+    plt.bar([i + width for i in x], cleaned_valid.fillna(0), width, label='After Cleaning', alpha=0.8)
+    plt.xlabel('Columns')
+    plt.ylabel('Percentage of Valid Data')
+    plt.title('Percentage of Valid Data Before and After Cleaning')
+    plt.xticks([i + width/2 for i in x], original_valid.index, rotation=90)
+    plt.legend()
+    # Add percentage labels on the bars with smaller font size,
+    # skipping columns that do not exist on that side.
+    for offset, series in ((0, original_valid), (width, cleaned_valid)):
+        for i, v in enumerate(series):
+            if pd.notna(v):
+                plt.text(i + offset, v, f'{v:.1f}%', ha='center', va='bottom', fontsize=6)
+    plt.tight_layout()
+    plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
+    plt.close()
+def plot_column_schemas(df):
+    """Bar-chart how many columns of *df* have each dtype.
+
+    Saves ``column_schemas.png`` in REPORT_DIR.
+    """
+    dtype_counts = df.dtypes.astype(str).value_counts()
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.barplot(x=dtype_counts.index, y=dtype_counts.values, ax=ax)
+    ax.set(title='Column Data Types', xlabel='Data Type', ylabel='Count')
+    save_plot(fig, 'column_schemas.png')
+def plot_nonconforming_cells(nonconforming_cells):
+    """Bar-chart the values of *nonconforming_cells* against its keys.
+
+    Anything other than a dict is reported on stdout and nothing is
+    plotted. Otherwise ``nonconforming_cells.png`` is saved in REPORT_DIR.
+    """
+    # Reject non-dict input up front so the plotting path stays flat.
+    if not isinstance(nonconforming_cells, dict):
+        print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.")
+        return
+    column_names = list(nonconforming_cells.keys())
+    cell_counts = list(nonconforming_cells.values())
+    fig, ax = plt.subplots(figsize=(12, 6))
+    sns.barplot(x=column_names, y=cell_counts, ax=ax)
+    ax.set_title('Nonconforming Cells by Column')
+    ax.set_xlabel('Columns')
+    ax.set_ylabel('Number of Nonconforming Cells')
+    plt.xticks(rotation=90)
+    save_plot(fig, 'nonconforming_cells.png')
+def plot_column_distributions(original_df, cleaned_df):
+    """Overlay before/after histograms for each numeric column kept by cleaning.
+
+    Only columns that are numeric in *original_df* and still present in
+    *cleaned_df* get a panel, three panels per row. If there are none, a
+    message is printed and nothing is saved.
+
+    Args:
+        original_df: DataFrame as it was before cleaning.
+        cleaned_df: DataFrame after cleaning.
+
+    Saves ``distributions_before_after_cleaning.png`` in REPORT_DIR.
+    """
+    numeric_columns = original_df.select_dtypes(include=[np.number]).columns
+    # Lay out panels only for columns that can actually be compared, so a
+    # column dropped during cleaning does not leave an empty panel behind.
+    shared_columns = [column for column in numeric_columns if column in cleaned_df.columns]
+    if not shared_columns:
+        print("No numeric columns found for distribution plots.")
+        return
+    n_rows = (len(shared_columns) + 2) // 3
+    # squeeze=False always returns a 2-D array of Axes, so flatten() is safe
+    # even when there is a single row (1 to 3 columns).
+    fig, axes = plt.subplots(nrows=n_rows, ncols=3, figsize=(18, 5 * n_rows), squeeze=False)
+    axes = axes.flatten()
+    for ax, column in zip(axes, shared_columns):
+        sns.histplot(original_df[column].dropna(), ax=ax, kde=True, color='blue', label='Before Cleaning', alpha=0.5)
+        sns.histplot(cleaned_df[column].dropna(), ax=ax, kde=True, color='orange', label='After Cleaning', alpha=0.5)
+        ax.set_title(f'{column} - Distribution Before & After Cleaning')
+        ax.legend()
+    # Remove any unused subplots in the last row
+    for unused_ax in axes[len(shared_columns):]:
+        fig.delaxes(unused_ax)
+    plt.tight_layout()
+    save_plot(fig, 'distributions_before_after_cleaning.png')
+def plot_boxplot_with_outliers(df):
+    """Draw one boxplot per numeric column of *df*, three panels per row.
+
+    If *df* has no numeric columns a message is printed and nothing is
+    saved. Otherwise ``boxplots_with_outliers.png`` is saved in REPORT_DIR.
+    """
+    print("Plotting boxplots with outliers...")
+    numeric_columns = df.select_dtypes(include=[np.number]).columns
+    num_columns = len(numeric_columns)
+    if num_columns == 0:
+        print("No numeric columns found for boxplot.")
+        return
+    n_rows = (num_columns + 2) // 3
+    # squeeze=False always returns a 2-D array of Axes, so flatten() is safe
+    # even when there is a single row (1 to 3 columns).
+    fig, axes = plt.subplots(nrows=n_rows, ncols=3, figsize=(15, 5 * n_rows), squeeze=False)
+    axes = axes.flatten()
+    for ax, column in zip(axes, numeric_columns):
+        sns.boxplot(x=df[column], ax=ax)
+        ax.set_title(f'Boxplot of {column} with Outliers')
+    # Remove any unused subplots in the last row
+    for unused_ax in axes[num_columns:]:
+        fig.delaxes(unused_ax)
+    plt.tight_layout()
+    save_plot(fig, 'boxplots_with_outliers.png')
+def plot_correlation_heatmap(df):
+    """Plot the pairwise correlation matrix of the numeric columns of *df*.
+
+    If *df* has no numeric columns a message is printed and nothing is
+    saved, matching the other numeric plots in this module. Otherwise
+    ``correlation_heatmap.png`` is saved in REPORT_DIR.
+    """
+    # Select only numeric, float, and integer columns
+    numeric_df = df.select_dtypes(include=[np.number])
+    # An empty correlation matrix cannot be drawn as a heatmap; bail out early.
+    if numeric_df.shape[1] == 0:
+        print("No numeric columns found for correlation heatmap.")
+        return
+    # Compute the correlation matrix
+    correlation_matrix = numeric_df.corr()
+    # Plot the heatmap
+    fig, ax = plt.subplots(figsize=(15, 10))
+    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax, cbar_kws={'label': 'Correlation'})
+    ax.set_title('Correlation Heatmap')
+    save_plot(fig, 'correlation_heatmap.png')
+def plot_process_times(process_times):
+    """Chart step durations in two stacked bar plots.
+
+    *process_times* maps a step name to a duration that is divided by 60
+    before plotting (seconds to minutes). Steps whose name starts with
+    "Clean column:" go to the lower panel and all others to the upper one.
+    The figure title shows the total. Saves ``process_times.png`` in
+    REPORT_DIR.
+    """
+    minutes_by_step = {step: seconds / 60 for step, seconds in process_times.items()}
+    # Split the timings into the two panels' data sets.
+    main_processes = {}
+    column_processes = {}
+    for step, minutes in minutes_by_step.items():
+        bucket = column_processes if step.startswith("Clean column:") else main_processes
+        bucket[step] = minutes
+    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))
+    panels = (
+        (ax1, main_processes, 'Main Process Times', 45),
+        (ax2, column_processes, 'Column Cleaning Times', 90),
+    )
+    for ax, timings, heading, tick_rotation in panels:
+        bars = ax.bar(timings.keys(), timings.values())
+        ax.set_title(heading)
+        ax.set_ylabel('Time (minutes)')
+        ax.tick_params(axis='x', rotation=tick_rotation)
+        # Value label centred on top of each bar
+        for bar in bars:
+            height = bar.get_height()
+            ax.text(bar.get_x() + bar.get_width() / 2., height,
+                    f'{height:.2f}', ha='center', va='bottom')
+    # Total time goes into the figure-level title
+    total_time = sum(minutes_by_step.values())
+    fig.suptitle(f'Process Times (Total: {total_time:.2f} minutes)', fontsize=16)
+    plt.tight_layout()
+    save_plot(fig, 'process_times.png')
+def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows):
+    """Generate every report plot in a fixed order, announcing each on stdout.
+
+    Ensures REPORT_DIR exists, applies the seaborn "whitegrid" style and a
+    400 dpi figure default, then runs the plotting steps one after another.
+    *removed_columns* and *removed_rows* are accepted but not used here.
+    """
+    os.makedirs(REPORT_DIR, exist_ok=True)
+    sns.set_style("whitegrid")
+    plt.rcParams['figure.dpi'] = 400
+    # (message, action) pairs, executed strictly in this order.
+    report_steps = (
+        ("Plotting valid data percentages...", lambda: plot_valid_data_percentage(original_df, cleaned_df)),
+        ("Plotting column schemas...", lambda: plot_column_schemas(cleaned_df)),
+        ("Plotting nonconforming cells before cleaning...", lambda: plot_nonconforming_cells(nonconforming_cells_before)),
+        ("Plotting column distributions...", lambda: plot_column_distributions(original_df, cleaned_df)),
+        ("Plotting process times...", lambda: plot_process_times(process_times)),
+        ("Plotting heatmaps...", lambda: plot_heatmap(original_df, "Missing Values Before Cleaning")),
+        ("Plotting correlation heatmap...", lambda: plot_correlation_heatmap(cleaned_df)),
+    )
+    for announcement, render in report_steps:
+        print(announcement)
+        render()
     print(f"All visualization reports saved in directory: {REPORT_DIR}")