Spaces:
Sleeping
Sleeping
| import os | |
| from collections import Counter | |
| import numpy as np | |
| import pandas as pd | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| from datetime import datetime | |
| REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}" | |
| os.makedirs(REPORT_DIR, exist_ok=True) | |
| def save_plot(fig, filename): | |
| fig.savefig(os.path.join(REPORT_DIR, filename), dpi=400, bbox_inches='tight') | |
| plt.close(fig) | |
| def plot_heatmap(df, title): | |
| # Calculate the percentage of null values for each column | |
| null_percentages = df.isnull().mean() * 100 | |
| plt.figure(figsize=(12, 8)) | |
| sns.heatmap(null_percentages.to_frame().T, cbar=True, cmap='Reds', annot=True, fmt='.1f') | |
| plt.title(title) | |
| plt.ylabel('Percentage of Missing Values') | |
| plt.tight_layout() | |
| save_plot(plt.gcf(), f'{title.lower().replace(" ", "_")}.png') | |
| def plot_column_schemas(df): | |
| # Get the data types of all columns | |
| data_types = df.dtypes.astype(str).tolist() | |
| data_types = [dtype.capitalize() for dtype in data_types] | |
| # Count the occurrences of each data type | |
| type_counts = Counter(data_types) | |
| fig, ax = plt.subplots(figsize=(10, 6)) | |
| # Generate a color palette with as many colors as there are bars | |
| colors = plt.cm.tab20(np.linspace(0, 1, len(type_counts))) | |
| # Plot the bars | |
| bars = ax.bar(type_counts.keys(), type_counts.values(), color=colors) | |
| ax.set_title('Column Data Types') | |
| ax.set_xlabel('Data Type') | |
| ax.set_ylabel('Count') | |
| # Add value labels on top of each bar | |
| for bar in bars: | |
| height = bar.get_height() | |
| ax.text(bar.get_x() + bar.get_width() / 2., height, | |
| f'{int(height)}', | |
| ha='center', va='bottom') | |
| plt.xticks(rotation=45) | |
| plt.tight_layout() | |
| save_plot(fig, 'column_schemas.png') | |
| def plot_nonconforming_cells(nonconforming_cells): | |
| # Ensure that nonconforming_cells is a dictionary | |
| if isinstance(nonconforming_cells, dict): | |
| # Proceed with plotting if it's a dictionary | |
| fig, ax = plt.subplots(figsize=(12, 6)) | |
| # Generate a color palette with as many colors as there are bars | |
| colors = plt.cm.rainbow(np.linspace(0, 1, len(nonconforming_cells))) | |
| # Plot the bars | |
| bars = ax.bar(list(nonconforming_cells.keys()), list(nonconforming_cells.values()), color=colors) | |
| ax.set_title('Nonconforming Cells by Column') | |
| ax.set_xlabel('Columns') | |
| ax.set_ylabel('Number of Nonconforming Cells') | |
| plt.xticks(rotation=90) | |
| # Add value labels on top of each bar | |
| for bar in bars: | |
| height = bar.get_height() | |
| ax.text(bar.get_x() + bar.get_width() / 2., height, | |
| f'{height:,}', | |
| ha='center', va='bottom') | |
| save_plot(fig, 'nonconforming_cells.png') | |
| else: | |
| print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.") | |
| def plot_column_distributions(cleaned_df, primary_key_column): | |
| print("Plotting distribution charts for numeric columns in the cleaned DataFrame...") | |
| numeric_columns = cleaned_df.select_dtypes(include=[np.number]).columns.tolist() | |
| numeric_columns = [col for col in numeric_columns if col != primary_key_column] | |
| num_columns = len(numeric_columns) | |
| if num_columns == 0: | |
| print("No numeric columns found in the cleaned DataFrame for distribution plots.") | |
| return | |
| # Create subplots for distributions | |
| ncols = 3 | |
| nrows = (num_columns + ncols - 1) // ncols # Ceiling division | |
| fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5 * nrows)) | |
| axes = axes.flatten() if num_columns > 1 else [axes] | |
| for i, column in enumerate(numeric_columns): | |
| cleaned_data = cleaned_df[column].dropna() | |
| sns.histplot(cleaned_data, ax=axes[i], kde=True, color='orange', label='After Cleaning', alpha=0.7) | |
| axes[i].set_title(f'{column} - Distribution After Cleaning') | |
| axes[i].legend() | |
| # Remove any unused subplots | |
| for j in range(i + 1, len(axes)): | |
| fig.delaxes(axes[j]) | |
| plt.tight_layout() | |
| save_plot(fig, 'distributions_after_cleaning.png') | |
| def plot_boxplot_with_outliers(original_df, primary_key_column): | |
| print("Plotting boxplots for numeric columns in the original DataFrame...") | |
| numeric_columns = original_df.select_dtypes(include=[np.number]).columns.tolist() | |
| numeric_columns = [col for col in numeric_columns if col != primary_key_column] | |
| num_columns = len(numeric_columns) | |
| if num_columns == 0: | |
| print("No numeric columns found in the original DataFrame for boxplots.") | |
| return | |
| # Create subplots based on the number of numeric columns | |
| ncols = 3 | |
| nrows = (num_columns + ncols - 1) // ncols # Ceiling division | |
| fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(18, 5 * nrows)) | |
| axes = axes.flatten() if num_columns > 1 else [axes] | |
| for i, column in enumerate(numeric_columns): | |
| data = original_df[column].dropna() | |
| sns.boxplot(x=data, ax=axes[i], color='blue', orient='h') | |
| axes[i].set_title(f'Boxplot of {column} (Before Cleaning)') | |
| # Remove any unused subplots | |
| for j in range(i + 1, len(axes)): | |
| fig.delaxes(axes[j]) | |
| plt.tight_layout() | |
| save_plot(fig, 'boxplots_before_cleaning.png') | |
| def plot_correlation_heatmap(df, primary_key_column): | |
| numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist() | |
| numeric_columns = [col for col in numeric_columns if col != primary_key_column] | |
| if not numeric_columns: | |
| print("No numeric columns found for correlation heatmap.") | |
| return | |
| corr_matrix = df[numeric_columns].corr() | |
| plt.figure(figsize=(15, 10)) | |
| sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar_kws={'label': 'Correlation'}) | |
| plt.title('Correlation Heatmap') | |
| plt.tight_layout() | |
| save_plot(plt.gcf(), 'correlation_heatmap.png') | |
| def plot_process_times(process_times): | |
| # Convert seconds to minutes | |
| process_times_minutes = {k: v / 60 for k, v in process_times.items()} | |
| # Separate main processes and column cleaning processes | |
| main_processes = {k: v for k, v in process_times_minutes.items() if not k.startswith("Clean column:")} | |
| column_processes = {k: v for k, v in process_times_minutes.items() if k.startswith("Clean column:")} | |
| # Create the plot | |
| fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10)) | |
| # Plot main processes | |
| bars1 = ax1.bar(main_processes.keys(), main_processes.values()) | |
| ax1.set_title('Main Process Times') | |
| ax1.set_ylabel('Time (minutes)') | |
| ax1.tick_params(axis='x', rotation=45) | |
| # Plot column cleaning processes | |
| bars2 = ax2.bar(column_processes.keys(), column_processes.values()) | |
| ax2.set_title('Column Cleaning Times') | |
| ax2.set_ylabel('Time (minutes)') | |
| ax2.tick_params(axis='x', rotation=90) | |
| # Add value labels on top of each bar with 3 decimal places | |
| for ax, bars in zip([ax1, ax2], [bars1, bars2]): | |
| for bar in bars: | |
| height = bar.get_height() | |
| ax.text(bar.get_x() + bar.get_width() / 2., height, | |
| f'{height:.4f}', ha='center', va='bottom') | |
| # Add total time to the plot with 3 decimal places | |
| total_time = sum(process_times_minutes.values()) | |
| fig.suptitle(f'Process Times (Total: {total_time:.3f} minutes)', fontsize=16) | |
| plt.tight_layout() | |
| save_plot(fig, 'process_times.png') | |
| def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns, | |
| removed_rows, primary_key_column): | |
| os.makedirs(REPORT_DIR, exist_ok=True) | |
| sns.set_style("whitegrid") | |
| plt.rcParams['figure.dpi'] = 400 | |
| print("Plotting nonconforming cells before cleaning...") | |
| plot_nonconforming_cells(nonconforming_cells_before) | |
| print("Plotting column distributions...") | |
| plot_column_distributions(cleaned_df, primary_key_column) | |
| print("Plotting boxplots for original data...") | |
| plot_boxplot_with_outliers(original_df, primary_key_column) | |
| print("Plotting process times...") | |
| plot_process_times(process_times) | |
| print("Plotting heatmaps...") | |
| plot_heatmap(original_df, "Missing Values Before Cleaning") | |
| print("Plotting correlation heatmap...") | |
| plot_correlation_heatmap(cleaned_df, primary_key_column) | |
| print("Plotting column schemas...") | |
| plot_column_schemas(cleaned_df) | |
| print(f"All visualization reports saved in directory: {REPORT_DIR}") |