import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, mean_absolute_error, r2_score
)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from io import BytesIO

# Streamlit app title
st.title("Model Training with Outlier Removal, Metrics, and Correlation Heatmap")

# File uploader
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])

if uploaded_file is not None:
    # Read the uploaded CSV file
    df = pd.read_csv(uploaded_file)

    # Display the dataset
    st.write("Dataset:")
    st.dataframe(df)

    # Handle missing values first so that label encoding below never sees NaNs
    st.write("Handling Missing (Null) Values:")
    fill_method = st.selectbox(
        "Choose how to handle missing values",
        ["Drop rows", "Fill with mean (numeric) / mode (categorical)"]
    )
    if fill_method == "Drop rows":
        df = df.dropna()
    else:
        for col in df.columns:
            if df[col].dtype in ['float64', 'int64']:
                df[col] = df[col].fillna(df[col].mean())
            else:
                df[col] = df[col].fillna(df[col].mode()[0])

    # Convert categorical (str) and low-cardinality columns to numerical values
    st.write("Converting Categorical Columns to Numerical Values:")
    label_encoder = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'object' or len(df[col].unique()) <= 10:
            st.write(f"Encoding Column: **{col}**")
            df[col] = label_encoder.fit_transform(df[col])

    # Display the dataset after conversion
    st.write("Dataset After Conversion:")
    st.dataframe(df)

    # Remove outliers using the IQR method
    st.write("Removing Outliers Using IQR:")

    def remove_outliers_iqr(data, column):
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        original_count = len(df)
        df = remove_outliers_iqr(df, col)
        st.write(f"Removed outliers from **{col}**: {original_count - len(df)} rows removed.")

    # Cap extreme values at the 5th and 95th percentiles
    st.write("Handling Extreme Values (Capping):")

    def cap_extreme_values(dataframe):
        for col in dataframe.select_dtypes(include=[np.number]).columns:
            lower_limit = dataframe[col].quantile(0.05)
            upper_limit = dataframe[col].quantile(0.95)
            dataframe[col] = np.clip(dataframe[col], lower_limit, upper_limit)
        return dataframe

    df = cap_extreme_values(df)

    # Display dataset after cleaning
    st.write("Dataset After Outlier Removal and Capping Extreme Values:")
    st.dataframe(df)

    # Add cleaned data download option
    st.subheader("Download Cleaned Dataset")
    st.download_button(
        label="Download Cleaned Dataset (CSV)",
        data=df.to_csv(index=False),
        file_name="cleaned_dataset.csv",
        mime="text/csv"
    )

    # Correlation heatmap
    st.subheader("Correlation Heatmap")
    corr = df.corr()
    heatmap_fig = plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", cbar=True)
    st.pyplot(heatmap_fig)

    # Save heatmap as PNG
    buf = BytesIO()
    heatmap_fig.savefig(buf, format="png")
    buf.seek(0)
    st.download_button(
        label="Download Correlation Heatmap as PNG",
        data=buf,
        file_name="correlation_heatmap.png",
        mime="image/png"
    )

    # Highlight highly correlated pairs
    st.subheader("Highly Correlated Features")
    high_corr = corr.abs().unstack().sort_values(ascending=False).drop_duplicates()
    high_corr = high_corr[high_corr.index.get_level_values(0) != high_corr.index.get_level_values(1)]
    high_corr_df = pd.DataFrame(high_corr, columns=["Correlation"])
    st.dataframe(high_corr_df)

    # Download correlation table as CSV
    st.download_button(
        label="Download Correlation Table (CSV)",
        data=high_corr_df.to_csv(index=True),
        file_name="correlation_table.csv",
        mime="text/csv"
    )

    # Select target variable
    target = st.selectbox("Select Target Variable", df.columns)
    features = [col for col in df.columns if col != target]
    X = df[features]
    y = df[target]

    if len(y.unique()) > 1:  # The target variable needs at least two unique classes/values
        if y.dtype == 'object' or len(y.unique()) <= 10:
            # Classification
            st.subheader("Classification Model Training")
            classifiers = {
                'Logistic Regression': LogisticRegression(max_iter=2000),
                'Decision Tree': DecisionTreeClassifier(),
                'Random Forest': RandomForestClassifier(),
                'Support Vector Machine (SVM)': SVC(),
                'K-Nearest Neighbors (k-NN)': KNeighborsClassifier(),
                'Naive Bayes': GaussianNB()
            }

            metrics = []
            train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=1 - train_size, stratify=y, random_state=42
            )

            for name, classifier in classifiers.items():
                classifier.fit(X_train, y_train)
                y_pred = classifier.predict(X_test)
                metrics.append({
                    'Model': name,
                    'Accuracy': round(accuracy_score(y_test, y_pred), 2),
                    'Precision': round(precision_score(y_test, y_pred, zero_division=1, average='macro'), 2),
                    'Recall': round(recall_score(y_test, y_pred, zero_division=1, average='macro'), 2),
                    'F1-Score': round(f1_score(y_test, y_pred, zero_division=1, average='macro'), 2)
                })

            metrics_df = pd.DataFrame(metrics)
            st.subheader("Classification Model Performance Metrics")
            st.dataframe(metrics_df)

            # Save metrics as PNG (table form)
            fig, ax = plt.subplots(figsize=(8, 4))
            ax.axis('tight')
            ax.axis('off')
            table = ax.table(cellText=metrics_df.values, colLabels=metrics_df.columns,
                             cellLoc='center', loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.auto_set_column_width(col=list(range(len(metrics_df.columns))))
            buf = BytesIO()
            fig.savefig(buf, format="png")
            buf.seek(0)
            st.download_button(
                label="Download Classification Metrics Table as PNG",
                data=buf,
                file_name="classification_metrics_table.png",
                mime="image/png"
            )

            # Visualization (bar graph for classification metrics)
            st.subheader("Classification Model Performance Metrics Graph")
            metrics_df.set_index('Model', inplace=True)
            ax = metrics_df.plot(kind='bar', figsize=(10, 6), colormap='coolwarm', rot=45)
            plt.title("Classification Models - Performance Metrics")
            plt.ylabel("Scores")
            plt.xlabel("Models")
            st.pyplot(ax.figure)

            # Download button for the bar graph
            buf = BytesIO()
            ax.figure.savefig(buf, format="png")
            buf.seek(0)
            st.download_button(
                label="Download Classification Performance Graph as PNG",
                data=buf,
                file_name="classification_performance_graph.png",
                mime="image/png"
            )
        else:
            # Regression
            st.subheader("Regression Model Training")
            regressors = {
                'Linear Regression': LinearRegression(),
                'Decision Tree Regressor': DecisionTreeRegressor(),
                'Random Forest Regressor': RandomForestRegressor(),
                'Support Vector Regressor (SVR)': SVR(),
                'K-Nearest Neighbors Regressor (k-NN)': KNeighborsRegressor()
            }

            regression_metrics = []
            train_size = st.slider("Select Training Size", min_value=0.1, max_value=0.9, value=0.8)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=1 - train_size, random_state=42
            )

            for name, regressor in regressors.items():
                regressor.fit(X_train, y_train)
                y_pred = regressor.predict(X_test)
                regression_metrics.append({
                    'Model': name,
                    'Mean Squared Error (MSE)': round(mean_squared_error(y_test, y_pred), 2),
                    'Mean Absolute Error (MAE)': round(mean_absolute_error(y_test, y_pred), 2),
                    'R² Score': round(r2_score(y_test, y_pred), 2)
                })

            regression_metrics_df = pd.DataFrame(regression_metrics)
            st.subheader("Regression Model Performance Metrics")
            st.dataframe(regression_metrics_df)

            # Save metrics as PNG (table form)
            fig, ax = plt.subplots(figsize=(8, 4))
            ax.axis('tight')
            ax.axis('off')
            table = ax.table(cellText=regression_metrics_df.values, colLabels=regression_metrics_df.columns,
                             cellLoc='center', loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.auto_set_column_width(col=list(range(len(regression_metrics_df.columns))))
            buf = BytesIO()
            fig.savefig(buf, format="png")
            buf.seek(0)
            st.download_button(
                label="Download Regression Metrics Table as PNG",
                data=buf,
                file_name="regression_metrics_table.png",
                mime="image/png"
            )

            # Visualization (bar graph for regression metrics)
            st.subheader("Regression Model Performance Metrics Graph")
            regression_metrics_df.set_index('Model', inplace=True)
            ax = regression_metrics_df.plot(kind='bar', figsize=(10, 6), colormap='coolwarm', rot=45)
            plt.title("Regression Models - Performance Metrics")
            plt.ylabel("Scores")
            plt.xlabel("Models")
            st.pyplot(ax.figure)

            # Download button for the bar graph
            buf = BytesIO()
            ax.figure.savefig(buf, format="png")
            buf.seek(0)
            st.download_button(
                label="Download Regression Performance Graph as PNG",
                data=buf,
                file_name="regression_performance_graph.png",
                mime="image/png"
            )
    else:
        st.error("The target variable must contain at least two unique values for classification or regression. Please check your dataset.")