import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import streamlit as st
from pandas.api.types import is_string_dtype
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
# --- Page setup -------------------------------------------------------------
# Configure the browser tab and layout, then render the title and the intro text.
# NOTE(review): the "π" used as page icon / title prefix looks like a mis-decoded
# emoji; it is reproduced unchanged here -- confirm the intended character.
st.set_page_config(
    page_title="Data Visualizer & Model Trainer",
    page_icon="π",
    layout="wide",
    initial_sidebar_state='expanded',
)

st.title("π Data Visualizer & Model Trainer - Web App")

intro_text = """
This app allows you to upload your data, visualize it through various plots, analyze descriptive statistics, and run multiple regression models to find the best one based on MSE.
"""
st.markdown(intro_text)
# --- Sidebar: CSV upload ------------------------------------------------------
# Lets the user pick a CSV file and loads it into `df`.
# After this section `df` exists only when `uploaded_file` is not None.
with st.sidebar:
    st.header("Upload and Select Data")
    uploaded_file = st.file_uploader("Choose a CSV file", type=['csv'])

    if uploaded_file is not None:
        try:
            df = pd.read_csv(uploaded_file)
        except (
            pd.errors.EmptyDataError,
            pd.errors.ParserError,
            UnicodeDecodeError,
        ) as exc:
            # An empty, malformed or non-UTF-8 file should produce a readable
            # message instead of a traceback.
            st.error(f"Could not read the uploaded file as CSV: {exc}")
            # The rest of the script is gated on `uploaded_file is not None`;
            # clearing it keeps those sections from using an undefined `df`.
            uploaded_file = None
        else:
            st.success("File successfully uploaded!")
# --- Main workflow (only when a file has been loaded into `df`) --------------
if uploaded_file is not None:

    # ---- Data preview: shown before preprocessing, so it mirrors the CSV ----
    st.subheader("Data Preview")
    preview_rows = st.slider("How many rows to display?", 5, 100, 20)
    st.dataframe(df.head(preview_rows))

    # ---- Preprocessing: make every string column numeric --------------------
    # String columns that parse as dates become <col>_year/_month/_day;
    # all other string columns are one-hot encoded (first level dropped).
    # Iterate over a snapshot of the labels because the loop adds/removes columns.
    for col in list(df.columns):
        if not is_string_dtype(df[col]):
            continue
        try:
            with warnings.catch_warnings():
                # pandas emits a UserWarning per column when it cannot infer a
                # datetime format; here that is the expected "not a date" path.
                warnings.simplefilter("ignore", UserWarning)
                parsed = pd.to_datetime(df[col])
            date_parts = {
                "year": parsed.dt.year,
                "month": parsed.dt.month,
                "day": parsed.dt.day,
            }
        except Exception:
            # Deliberately broad: any failure to treat the column as dates means
            # it is handled as categorical. `df` has not been modified yet.
            df = pd.get_dummies(df, columns=[col], drop_first=True)
        else:
            for part, values in date_parts.items():
                df[f"{col}_{part}"] = values
            df = df.drop(columns=[col])

    # ---- Data analysis -------------------------------------------------------
    st.subheader("Data Analysis Tasks")
    analysis_options = ["Descriptive Statistics", "Missing Values Analysis", "Correlation Heatmap"]
    selected_analysis = st.multiselect("Select analysis tasks you want to perform:", analysis_options)

    if "Descriptive Statistics" in selected_analysis:
        st.write("### Descriptive Statistics")
        st.write(df.describe())

    if "Missing Values Analysis" in selected_analysis:
        st.write("### Missing Values Analysis")
        missing_values = df.isnull().sum()
        missing_values = missing_values[missing_values > 0]
        if missing_values.empty:
            st.info("No missing values found.")
        else:
            st.write(missing_values)

    if "Correlation Heatmap" in selected_analysis:
        st.write("### Correlation Heatmap")
        numeric_df = df.select_dtypes(include=[np.number])
        if numeric_df.shape[1] == 0:
            # An empty correlation matrix cannot be drawn as a heatmap.
            st.info("No numeric columns available for a correlation heatmap.")
        else:
            # Use an explicit figure (not the global pyplot state) and close it
            # after rendering so figures do not pile up across reruns.
            heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 7))
            sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=heatmap_ax)
            st.pyplot(heatmap_fig)
            plt.close(heatmap_fig)

    # ---- Data visualization --------------------------------------------------
    st.subheader("Data Visualization")
    plot_types = ["Line Plot", "Bar Plot", "Scatter Plot", "Histogram", "Interactive Plot", "Box Plot", "Pair Plot"]
    selected_plots = st.multiselect("Choose plot types:", plot_types)

    if selected_plots:
        columns = df.columns.tolist()
        x_axis = st.selectbox("Select the X-axis", options=columns, index=0)
        y_axis_options = ['None'] + columns
        y_axis = st.selectbox("Select the Y-axis", options=y_axis_options, index=0)
        has_y = y_axis != 'None'

        # seaborn plotters that require both an X and a Y column.
        xy_plotters = {
            "Line Plot": sns.lineplot,
            "Bar Plot": sns.barplot,
            "Scatter Plot": sns.scatterplot,
            "Box Plot": sns.boxplot,
        }

        for plot_type in selected_plots:
            st.write(f"### {plot_type}")

            if plot_type == "Interactive Plot":
                fig = px.scatter(
                    df,
                    x=x_axis,
                    y=y_axis if has_y else None,
                    # Avoid a "None vs <x>" title when no Y column is chosen.
                    title=f"{y_axis} vs {x_axis}" if has_y else f"{x_axis}",
                )
                st.plotly_chart(fig, use_container_width=True)

            elif plot_type == "Pair Plot":
                # pairplot builds its own figure; render that one explicitly.
                grid = sns.pairplot(df)
                st.pyplot(grid.figure)
                plt.close(grid.figure)

            elif plot_type == "Histogram":
                fig, ax = plt.subplots()
                sns.histplot(data=df, x=x_axis, kde=True, ax=ax)
                st.pyplot(fig)
                plt.close(fig)

            elif not has_y:
                # Previously an empty figure was drawn without explanation.
                st.warning(f"{plot_type} needs a Y-axis column; please select one above.")

            else:
                fig, ax = plt.subplots()
                xy_plotters[plot_type](x=x_axis, y=y_axis, data=df, ax=ax)
                st.pyplot(fig)
                plt.close(fig)

    # ---- Model training & selection ------------------------------------------
    st.subheader("Model Training & Selection")
    target_column = st.selectbox("Select Target Column", options=df.columns)

    if st.button("Train Models and Select Best"):
        # The pipelines impute missing *features* only; rows whose target is
        # missing cannot be used for fitting or scoring, so drop them first.
        train_df = df.dropna(subset=[target_column])
        skipped_rows = len(df) - len(train_df)
        feature_columns = [col for col in train_df.columns if col != target_column]

        if not feature_columns:
            st.error("At least one feature column besides the target is required.")
        elif len(train_df) < 2:
            # train_test_split needs at least one training and one test row.
            st.error("At least two rows with a non-missing target are required to train models.")
        else:
            if skipped_rows:
                st.info(f"Ignored {skipped_rows} row(s) with a missing target value.")

            X = train_df[feature_columns]
            y = train_df[target_column]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Every candidate shares the same mean-imputation step.
            models = {
                'Linear Regression': make_pipeline(SimpleImputer(strategy='mean'), LinearRegression()),
                'Ridge Regression': make_pipeline(SimpleImputer(strategy='mean'), Ridge()),
                'Lasso Regression': make_pipeline(SimpleImputer(strategy='mean'), Lasso()),
                'Random Forest': make_pipeline(SimpleImputer(strategy='mean'), RandomForestRegressor(random_state=42)),
            }

            # Fit each model and score it on the held-out 20% split.
            mse_scores = {}
            for name, model in models.items():
                model.fit(X_train, y_train)
                predictions = model.predict(X_test)
                mse_scores[name] = mean_squared_error(y_test, predictions)

            # Lowest test MSE wins.
            best_model = min(mse_scores, key=mse_scores.get)
            st.write(f"Best Model: {best_model} with MSE: {mse_scores[best_model]}")

            for model_name, mse in mse_scores.items():
                st.write(f"{model_name}: MSE = {mse}")
# --- Footer: a horizontal rule followed by the credit line --------------------
for footer_markdown in (
    "---",
    "Developed by Isuru Lakshan ekanayaka - A versatile data visualization and model training app built with Streamlit.",
):
    st.markdown(footer_markdown)