# Hugging Face Spaces page header captured with the source ("Spaces: Running");
# it is not part of the program.
import pandas as pd | |
from sklearn.impute import KNNImputer | |
import gradio as gr | |
def get_clean_columns(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    Non-breaking spaces (``\\xa0``) in each label are replaced with ordinary
    spaces, then leading and trailing whitespace is stripped. The same frame
    object that was passed in is returned, with its ``columns`` reassigned.
    """
    without_nbsp = df.columns.str.replace('\xa0', ' ', regex=True)
    df.columns = without_nbsp.str.strip()
    return df
def get_imputable_columns(file_obj):
    """Work out which columns of an uploaded CSV can be offered for imputation.

    Returns a pair of Gradio updates. The first targets the column picker:
    its choices and its pre-selected values are the numeric columns that
    contain at least one missing value. The second targets the warning box,
    which is made visible only when no such column exists or when the file
    cannot be read; otherwise it is cleared and hidden.
    """
    # No file (or the upload was cleared): empty picker, hidden warning.
    if file_obj is None:
        return gr.update(choices=[], value=[]), gr.update(value="", visible=False)

    try:
        frame = get_clean_columns(pd.read_csv(file_obj))

        numeric_names = frame.select_dtypes(include=['number']).columns.tolist()
        numeric_part = frame[numeric_names]
        # Boolean mask over the numeric columns: True where any value is NaN.
        has_missing = numeric_part.isnull().any()
        candidates = numeric_part.columns[has_missing].tolist()

        if candidates:
            return (
                gr.update(choices=candidates, value=candidates),
                gr.update(value="", visible=False),
            )
        return (
            gr.update(choices=[], value=[]),
            gr.update(value="⚠️ No numeric columns with missing values found.", visible=True),
        )
    except Exception as e:
        # Any read/parse problem is reported in the warning box instead of
        # being raised to the UI.
        return (
            gr.update(choices=[], value=[]),
            gr.update(value=f"⚠️ Could not read file: {e}", visible=True),
        )
def _summary_row(metric, column, series):
    """Build one row of the before/after statistics table for ``series``."""
    return {
        'Metric': metric,
        'Column': column,
        'Mean': series.mean(),
        'Median': series.median(),
        'Std Dev': series.std(),
        'Min': series.min(),
        'Max': series.max(),
    }


def impute_and_analyze_data(input_file, selected_columns, n_neighbors=5):
    """Fill missing values in the selected numeric columns with KNN imputation.

    The selected columns are z-score standardised, imputed together with
    ``KNNImputer``, transformed back to their original scale and written to
    ``imputed_data_<uploaded file name>`` in the current working directory.

    Args:
        input_file: Path of the uploaded CSV file.
        selected_columns: Column names chosen for imputation. Names that are
            absent from the file or not numeric are ignored.
        n_neighbors: Number of neighbours passed to ``KNNImputer``.

    Returns:
        A 4-tuple ``(output_path, stats_df, replacement_df, warning_update)``:
        the path of the written CSV, a table of summary statistics before and
        after imputation, a table of replacement counts per column, and a
        Gradio update for the warning box (visible only when none of the
        selected columns had any observed value to impute from).

    Raises:
        gr.Error: If no file or no column is given, if none of the selected
            columns is a numeric column of the file, if the file is empty or
            cannot be parsed, or if any other error occurs while processing.
    """
    import os  # function-scope import; only needed to name the output file

    if input_file is None:
        raise gr.Error("Please upload a CSV file.")
    if not selected_columns:
        raise gr.Error("Please select at least one column for imputation.")

    try:
        df = get_clean_columns(pd.read_csv(input_file))

        valid_selected_columns = [
            col for col in selected_columns
            if col in df.columns and pd.api.types.is_numeric_dtype(df[col])
        ]
        if not valid_selected_columns:
            raise gr.Error("No valid numeric columns selected for imputation.")

        # `df` itself is never modified below, so it doubles as the
        # "before imputation" reference.
        original_nan_counts = {
            col: df[col].isnull().sum() for col in valid_selected_columns
        }
        stats_data = [
            _summary_row('Before Imputation', col, df[col])
            for col in valid_selected_columns
        ]
        replacements_data = []

        # basename() rather than split('/') so the name is also correct when
        # the path uses the platform's own separator.
        output_path = f"imputed_data_{os.path.basename(input_file)}"

        # Columns without a single observed value give the imputer nothing to
        # work with, so they are left out.
        df_for_imputation = df[valid_selected_columns].dropna(axis=1, how='all')
        imputation_target_cols = df_for_imputation.columns.tolist()

        if not imputation_target_cols:
            df.to_csv(output_path, index=False)
            return (
                output_path,
                pd.DataFrame(stats_data),
                pd.DataFrame(replacements_data),
                gr.update(value="⚠️ No columns eligible for imputation after preprocessing.", visible=True),
            )

        # Standardise so every column contributes comparably to the distances.
        means = df_for_imputation.mean()
        stds = df_for_imputation.std()
        # A constant column has std 0 and a column with one observed value has
        # std NaN. Dividing by either would turn the whole column into NaN,
        # which KNNImputer then drops, breaking the column mapping below.
        # Use a unit scale for those columns instead.
        stds[~(stds > 0)] = 1.0
        df_std = (df_for_imputation - means) / stds

        # The value comes from a UI slider and may arrive as a float;
        # KNNImputer expects an integer neighbour count.
        imputer = KNNImputer(n_neighbors=int(n_neighbors))
        df_imputed_std = pd.DataFrame(
            imputer.fit_transform(df_std),
            columns=imputation_target_cols,
            index=df_std.index,
        )
        # Undo the standardisation.
        df_imputed = df_imputed_std * stds + means

        df_final = df.copy()
        for col in imputation_target_cols:
            df_final[col] = df_imputed[col]

            original_nans = original_nan_counts.get(col, 0)
            remaining_nans = df_final[col].isnull().sum()
            replacements = original_nans - remaining_nans
            if replacements < 0:
                replacements = original_nans

            replacements_data.append({
                'Column': col,
                'Original NaNs': original_nans,
                'Replacements Made': replacements,
            })
            stats_data.append(_summary_row('After Imputation', col, df_final[col]))

        df_final.to_csv(output_path, index=False)

        stats_df = pd.DataFrame(stats_data)
        rounded_cols = ["Mean", "Median", "Std Dev"]
        stats_df[rounded_cols] = stats_df[rounded_cols].round(4)
        # Group rows by column; descending 'Metric' puts "Before Imputation"
        # ahead of "After Imputation".
        stats_df = stats_df.sort_values(by=['Column', 'Metric'], ascending=[True, False])
        replacement_df = pd.DataFrame(replacements_data)

        return output_path, stats_df, replacement_df, gr.update(value="", visible=False)

    except gr.Error:
        # Already a user-facing message; do not re-wrap it as "unexpected".
        raise
    except pd.errors.EmptyDataError as e:
        raise gr.Error("The uploaded file is empty.") from e
    except pd.errors.ParserError as e:
        raise gr.Error("Could not parse the CSV file. Please ensure it's a valid CSV.") from e
    except Exception as e:
        raise gr.Error(f"An unexpected error occurred: {e}") from e
# Create .ris citation file
# Each RIS line is "<TAG>  - <value>": a two-character tag, two spaces, a
# hyphen and one space. Building the text from (tag, value) pairs keeps that
# separator explicit and preserves the trailing space on the closing ER line.
_RIS_FIELDS = (
    ("TY", "COMP"),
    ("T1", "Missing value analysis with KNN Impute"),
    ("AU", "Mat Roni, S."),
    ("PY", "2025"),
    ("VL", "1.0"),
    ("PB", "Hugging Face"),
    ("UR", "https://huggingface.co/spaces/pvaluedotone/knn_impute"),
    ("ER", ""),
)
ris_content = "".join(f"{tag}  - {value}\n" for tag, value in _RIS_FIELDS)

# Written at import time so the file exists on disk under this name.
with open("citation.ris", "w", encoding="utf-8") as f:
    f.write(ris_content)
# --- Gradio Interface ---
# Declares the page layout, wires the two callbacks defined above to their
# widgets, and starts the app.
# NOTE(review): the original indentation of this section was lost, so which
# statements sit inside each `with gr.Row():` is reconstructed here (upload
# widget and slider share the first row; each output gets its own row) —
# confirm against the deployed layout.
with gr.Blocks() as app:
    gr.Markdown("# Missing value analysis with KNN Imputer")
    gr.Markdown("Upload your CSV dataset, select columns for imputation, and view statistics and imputation details.")
    gr.Markdown("**Citation:** Mat Roni, S. (2025). *Missing value analysis with KNN Impute* (version 1.0) [software]. [https://huggingface.co/spaces/pvaluedotone/knn_impute](https://huggingface.co/spaces/pvaluedotone/knn_impute)")
    # Read-only file widget pointing at the citation.ris file written above.
    gr.File(value="citation.ris", label="Download citation (.ris)", interactive=False)
    with gr.Row():
        # type="filepath": the callbacks receive the upload as a path.
        file_input = gr.File(label="Upload CSV file (.csv)", type="filepath")
        n_neighbors_slider = gr.Slider(minimum=1, maximum=10, step=1, value=5, label="Number of neighbours (n_neighbors)", interactive=True)
    # Hidden until one of the callbacks returns an update that makes it visible.
    warning_box = gr.Markdown("", visible=False)
    # Starts empty; get_imputable_columns fills in choices after an upload.
    column_checkboxes = gr.CheckboxGroup(label="Select columns for impute", choices=[], value=[])
    # Whenever the uploaded file changes, refresh the column picker and the
    # warning box (the two values get_imputable_columns returns, in order).
    file_input.change(
        fn=get_imputable_columns,
        inputs=file_input,
        outputs=[column_checkboxes, warning_box],
        queue=False
    )
    impute_button = gr.Button("Run imputation")
    with gr.Row():
        download_output = gr.File(label="Download imputed CSV")
    with gr.Row():
        replacement_display = gr.DataFrame(label="Replacement summary")
    with gr.Row():
        stats_display = gr.DataFrame(label="Before/after imputation statistics")
    # The outputs list follows the order of impute_and_analyze_data's return
    # tuple: (output_path, stats_df, replacement_df, warning update).
    impute_button.click(
        fn=impute_and_analyze_data,
        inputs=[file_input, column_checkboxes, n_neighbors_slider],
        outputs=[download_output, stats_display, replacement_display, warning_box]
    )
# Runs unconditionally when this module is executed or imported.
app.launch(debug=True)