knn_impute / app.py
pvaluedotone's picture
Update app.py
bf1894e verified
import pandas as pd
from sklearn.impute import KNNImputer
import gradio as gr
def get_clean_columns(df):
    """Tidy the column labels of ``df`` in place and return the same frame.

    Non-breaking spaces (``\\xa0``) in each label are replaced by ordinary
    spaces, then leading and trailing whitespace is stripped.
    """
    # Step 1: swap non-breaking spaces for regular spaces.
    without_nbsp = df.columns.str.replace('\xa0', ' ', regex=True)
    # Step 2: trim surrounding whitespace and write the labels back.
    df.columns = without_nbsp.str.strip()
    return df
def get_imputable_columns(file_obj):
    """Build the updates for the column picker and the warning box.

    Reads the CSV at ``file_obj``, cleans its column labels and looks for
    numeric columns that contain at least one missing value.

    Returns a pair of ``gr.update`` objects: the first sets the choices and
    selected values of the column picker (every eligible column is
    pre-selected), the second sets the text and visibility of the warning.
    When ``file_obj`` is ``None`` both are reset; when the file cannot be
    read, or has no eligible column, the picker is emptied and a warning
    is shown.
    """
    if file_obj is None:
        return gr.update(choices=[], value=[]), gr.update(value="", visible=False)

    try:
        frame = get_clean_columns(pd.read_csv(file_obj))

        numeric_names = frame.select_dtypes(include=['number']).columns.tolist()
        numeric_frame = frame[numeric_names]
        # Boolean mask: True for numeric columns holding at least one NaN.
        missing_mask = numeric_frame.isnull().any()
        candidates = numeric_frame.columns[missing_mask].tolist()

        if candidates:
            return (
                gr.update(choices=candidates, value=candidates),
                gr.update(value="", visible=False),
            )
        return (
            gr.update(choices=[], value=[]),
            gr.update(value="⚠️ No numeric columns with missing values found.", visible=True),
        )
    except Exception as e:
        # Any failure is reported in the warning box rather than raised.
        return (
            gr.update(choices=[], value=[]),
            gr.update(value=f"⚠️ Could not read file: {e}", visible=True),
        )
def _summarize_column(metric, col, series):
    """Return one descriptive-statistics row for ``series``, labelled ``metric``."""
    return {
        'Metric': metric,
        'Column': col,
        'Mean': series.mean(),
        'Median': series.median(),
        'Std Dev': series.std(),
        'Min': series.min(),
        'Max': series.max(),
    }


def _knn_impute_standardized(frame, n_neighbors):
    """Z-score ``frame``, KNN-impute it, and map the result back to the original scale."""
    means = frame.mean()
    stds = frame.std()
    # A constant column has std == 0 and a column with a single observed
    # value has std == NaN. Dividing by either turns the whole column into
    # NaN, KNNImputer then drops it, and the output no longer lines up with
    # the column labels. Using 1.0 as the scale keeps such columns intact.
    stds = stds.where(stds > 0, 1.0)
    standardized = (frame - means) / stds

    # KNNImputer requires an integer neighbour count; cast defensively in
    # case the caller hands over a float such as 5.0.
    imputer = KNNImputer(n_neighbors=int(n_neighbors))
    imputed = pd.DataFrame(
        imputer.fit_transform(standardized),
        columns=frame.columns,
        index=frame.index,
    )
    return imputed * stds + means


def impute_and_analyze_data(input_file, selected_columns, n_neighbors=5):
    """Impute missing values in the selected numeric columns with KNN.

    The selected columns are standardised, imputed with ``KNNImputer`` and
    transformed back to their original scale. The completed table is written
    to ``imputed_data_<uploaded file name>`` in the current working directory.

    Args:
        input_file: Path of the uploaded CSV file, or ``None``.
        selected_columns: Names of the columns chosen for imputation. Names
            that are missing from the file or not numeric are ignored.
        n_neighbors: Number of neighbours passed to ``KNNImputer``.

    Returns:
        A 4-tuple ``(output_path, stats_df, replacement_df, warning_update)``:
        the path of the written CSV, a frame of before/after statistics per
        column, a frame with the NaN count and number of replacements per
        column, and a ``gr.update`` for the warning box.

    Raises:
        gr.Error: If no file or no column was supplied, if none of the
            selected columns is numeric, if the file is empty or cannot be
            parsed, or if any other error occurs during processing.
    """
    # Local import keeps this block self-contained.
    import os

    if input_file is None:
        raise gr.Error("Please upload a CSV file.")
    if not selected_columns:
        raise gr.Error("Please select at least one column for imputation.")

    try:
        df = get_clean_columns(pd.read_csv(input_file))

        valid_selected_columns = [
            col for col in selected_columns
            if col in df.columns and pd.api.types.is_numeric_dtype(df[col])
        ]
        if not valid_selected_columns:
            raise gr.Error("No valid numeric columns selected for imputation.")

        original_nan_counts = {
            col: df[col].isnull().sum() for col in valid_selected_columns
        }
        stats_data = [
            _summarize_column('Before Imputation', col, df[col])
            for col in valid_selected_columns
        ]
        replacements_data = []

        # Columns without a single observed value cannot be imputed.
        df_for_imputation = df[valid_selected_columns].dropna(axis=1, how='all')
        imputation_target_cols = df_for_imputation.columns.tolist()

        # basename() also copes with Windows-style separators.
        output_path = f"imputed_data_{os.path.basename(input_file)}"

        if not imputation_target_cols:
            # Nothing to impute: hand back the unchanged data with a warning.
            df.to_csv(output_path, index=False)
            return (
                output_path,
                pd.DataFrame(stats_data),
                pd.DataFrame(replacements_data),
                gr.update(
                    value="⚠️ No columns eligible for imputation after preprocessing.",
                    visible=True,
                ),
            )

        df_imputed = _knn_impute_standardized(df_for_imputation, n_neighbors)

        df_final = df.copy()
        for col in imputation_target_cols:
            df_final[col] = df_imputed[col]

            original_nans = original_nan_counts.get(col, 0)
            remaining_nans = df_final[col].isnull().sum()
            replacements = original_nans - remaining_nans
            if replacements < 0:
                # Defensive guard: imputation is not expected to add NaNs.
                replacements = original_nans

            replacements_data.append({
                'Column': col,
                'Original NaNs': original_nans,
                'Replacements Made': replacements,
            })
            stats_data.append(
                _summarize_column('After Imputation', col, df_final[col])
            )

        df_final.to_csv(output_path, index=False)

        stats_df = pd.DataFrame(stats_data)
        rounded_cols = ["Mean", "Median", "Std Dev"]
        stats_df[rounded_cols] = stats_df[rounded_cols].round(4)
        # Descending on 'Metric' puts "Before Imputation" ahead of
        # "After Imputation" within each column.
        stats_df = stats_df.sort_values(
            by=['Column', 'Metric'], ascending=[True, False]
        )
        replacement_df = pd.DataFrame(replacements_data)

        return output_path, stats_df, replacement_df, gr.update(value="", visible=False)

    except gr.Error:
        # Already a user-facing message; do not re-wrap it as "unexpected".
        raise
    except pd.errors.EmptyDataError as exc:
        raise gr.Error("The uploaded file is empty.") from exc
    except pd.errors.ParserError as exc:
        raise gr.Error(
            "Could not parse the CSV file. Please ensure it's a valid CSV."
        ) from exc
    except Exception as exc:
        raise gr.Error(f"An unexpected error occurred: {exc}") from exc
# Write the RIS citation record to disk; the interface below serves
# "citation.ris" as a downloadable file.
ris_content = """TY - COMP
T1 - Missing value analysis with KNN Impute
AU - Mat Roni, S.
PY - 2025
VL - 1.0
PB - Hugging Face
UR - https://huggingface.co/spaces/pvaluedotone/knn_impute
ER -
"""
with open("citation.ris", mode="w") as ris_file:
    ris_file.write(ris_content)
# --- Gradio Interface ---
# Declarative UI: components appear in the order they are created, so the
# statement order below is also the page layout.
# NOTE(review): the nesting under each gr.Row() was reconstructed from a
# flattened source - confirm against the intended layout.
with gr.Blocks() as app:
    # Header, usage hint and citation (text plus downloadable .ris file).
    gr.Markdown("# Missing value analysis with KNN Imputer")
    gr.Markdown("Upload your CSV dataset, select columns for imputation, and view statistics and imputation details.")
    gr.Markdown("**Citation:** Mat Roni, S. (2025). *Missing value analysis with KNN Impute* (version 1.0) [software]. [https://huggingface.co/spaces/pvaluedotone/knn_impute](https://huggingface.co/spaces/pvaluedotone/knn_impute)")
    gr.File(value="citation.ris", label="Download citation (.ris)", interactive=False)
    # Inputs: the CSV upload and the neighbour count (1-10, default 5).
    with gr.Row():
        file_input = gr.File(label="Upload CSV file (.csv)", type="filepath")
        n_neighbors_slider = gr.Slider(minimum=1, maximum=10, step=1, value=5, label="Number of neighbours (n_neighbors)", interactive=True)
    # Hidden until one of the callbacks returns an update that makes it visible.
    warning_box = gr.Markdown("", visible=False)
    # Starts empty; filled by get_imputable_columns when the upload changes.
    column_checkboxes = gr.CheckboxGroup(label="Select columns for impute", choices=[], value=[])
    # Refresh the column choices and the warning whenever the uploaded file changes.
    file_input.change(
        fn=get_imputable_columns,
        inputs=file_input,
        outputs=[column_checkboxes, warning_box],
        queue=False
    )
    impute_button = gr.Button("Run imputation")
    # Outputs: imputed CSV download, replacement summary, before/after statistics.
    with gr.Row():
        download_output = gr.File(label="Download imputed CSV")
    with gr.Row():
        replacement_display = gr.DataFrame(label="Replacement summary")
    with gr.Row():
        stats_display = gr.DataFrame(label="Before/after imputation statistics")
    # The outputs list follows the order of the tuple returned by
    # impute_and_analyze_data: path, statistics, replacements, warning update.
    impute_button.click(
        fn=impute_and_analyze_data,
        inputs=[file_input, column_checkboxes, n_neighbors_slider],
        outputs=[download_output, stats_display, replacement_display, warning_box]
    )
# Start the app when this module is executed.
app.launch(debug=True)