Spaces:

isurulkh
/

Data-Visualization-and-Model-Training-Web-App-with-Streamlit

Running

App Files Files Community

Data-Visualization-and-Model-Training-Web-App-with-Streamlit / app.py

isurulkh

Upload 2 files

3d16f39 verified 12 months ago

raw

history blame

6.06 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import plotly.express as px
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LinearRegression, Ridge, Lasso
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import mean_squared_error
	from sklearn.impute import SimpleImputer
	from sklearn.pipeline import make_pipeline
	from pandas.api.types import is_string_dtype

	# Page-level settings: tab title and icon, wide layout, sidebar open on load
	st.set_page_config(
	    page_title="Data Visualizer & Model Trainer",
	    page_icon="📊",
	    layout="wide",
	    initial_sidebar_state="expanded",
	)

	# Heading, followed by a short description of what the app offers
	st.title("📊 Data Visualizer & Model Trainer - Web App")
	intro_text = """
	This app allows you to upload your data, visualize it through various plots, analyze descriptive statistics, and run multiple regression models to find the best one based on MSE.
	"""
	st.markdown(intro_text)

	# Sidebar for uploading data.
	# Defines `uploaded_file` always, and `df` only when a file was uploaded and parsed.
	with st.sidebar:
	    st.header("Upload and Select Data")
	    uploaded_file = st.file_uploader("Choose a CSV file", type=['csv'])
	    if uploaded_file is not None:
	        try:
	            df = pd.read_csv(uploaded_file)
	        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
	            # A file with a .csv extension can still be empty, malformed or not
	            # UTF-8; report it in the UI instead of surfacing a raw traceback.
	            st.error(f"Could not read the uploaded file as CSV: {exc}")
	            # Halt this script run so nothing below touches an undefined `df`.
	            st.stop()
	        else:
	            st.success("File successfully uploaded!")

	def _encode_text_columns(frame):
	    """Return a copy of *frame* whose text columns are replaced by derived features.

	    Each text column is first tried as a date; on success it is replaced by
	    ``<col>_year``, ``<col>_month`` and ``<col>_day`` columns. A column that
	    does not parse as dates is one-hot encoded with ``pd.get_dummies``
	    (``drop_first=True``). Non-text columns are left untouched.
	    """
	    frame = frame.copy()
	    # Snapshot the names up front: the frame is reshaped inside the loop.
	    for col in list(frame.columns):
	        series = frame[col]
	        # is_string_dtype alone may return False for object columns that mix
	        # text with missing values (newer pandas inspects the elements), which
	        # would leave raw text in the features; accept plain object dtype too.
	        if not (pd.api.types.is_object_dtype(series) or is_string_dtype(series)):
	            continue
	        try:
	            parsed = pd.to_datetime(series)
	            date_parts = {
	                f"{col}_year": parsed.dt.year,
	                f"{col}_month": parsed.dt.month,
	                f"{col}_day": parsed.dt.day,
	            }
	        except (ValueError, TypeError, OverflowError, AttributeError):
	            # Not a date column: treat it as categorical instead.
	            frame = pd.get_dummies(frame, columns=[col], drop_first=True)
	            continue
	        # Only touch the frame once the whole conversion is known to work.
	        for part_name, part_values in date_parts.items():
	            frame[part_name] = part_values
	        frame = frame.drop(columns=[col])
	    return frame


	def _render_analysis(frame):
	    """Render the analysis tasks the user selects for *frame*."""
	    st.subheader("Data Analysis Tasks")
	    analysis_options = ["Descriptive Statistics", "Missing Values Analysis", "Correlation Heatmap"]
	    selected_analysis = st.multiselect("Select analysis tasks you want to perform:", analysis_options)

	    if "Descriptive Statistics" in selected_analysis:
	        st.write("### Descriptive Statistics")
	        st.write(frame.describe())

	    if "Missing Values Analysis" in selected_analysis:
	        st.write("### Missing Values Analysis")
	        missing_values = frame.isnull().sum()
	        st.write(missing_values[missing_values > 0])

	    if "Correlation Heatmap" in selected_analysis:
	        st.write("### Correlation Heatmap")
	        numeric_df = frame.select_dtypes(include=[np.number])
	        if numeric_df.shape[1] == 0:
	            # An empty correlation matrix cannot be drawn as a heatmap.
	            st.info("No numeric columns are available for a correlation heatmap.")
	        else:
	            # Draw on an explicit Figure (not the pyplot module) and close it
	            # afterwards so figures do not pile up across script reruns.
	            fig, ax = plt.subplots(figsize=(10, 7))
	            sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
	            st.pyplot(fig)
	            plt.close(fig)


	def _render_plots(frame):
	    """Render every plot type the user selects, using the chosen X/Y columns."""
	    st.subheader("Data Visualization")
	    plot_types = ["Line Plot", "Bar Plot", "Scatter Plot", "Histogram", "Interactive Plot", "Box Plot", "Pair Plot"]
	    selected_plots = st.multiselect("Choose plot types:", plot_types)
	    if not selected_plots:
	        return

	    columns = frame.columns.tolist()
	    x_axis = st.selectbox("Select the X-axis", options=columns, index=0)
	    y_axis_options = ['None'] + columns
	    y_axis = st.selectbox("Select the Y-axis", options=y_axis_options, index=0)
	    has_y = y_axis != 'None'

	    # Seaborn plots that take both an x and a y column.
	    xy_plotters = {
	        "Line Plot": sns.lineplot,
	        "Bar Plot": sns.barplot,
	        "Scatter Plot": sns.scatterplot,
	        "Box Plot": sns.boxplot,
	    }

	    for plot_type in selected_plots:
	        st.write(f"### {plot_type}")
	        if plot_type == "Interactive Plot":
	            # Avoid a "None vs <x>" title when no Y-axis has been chosen.
	            title = f"{y_axis} vs {x_axis}" if has_y else f"{x_axis}"
	            fig = px.scatter(frame, x=x_axis, y=y_axis if has_y else None, title=title)
	            st.plotly_chart(fig, use_container_width=True)
	        elif plot_type == "Pair Plot":
	            grid = sns.pairplot(frame)
	            st.pyplot(grid.figure)
	            plt.close(grid.figure)
	        elif plot_type == "Histogram":
	            fig, ax = plt.subplots()
	            sns.histplot(data=frame, x=x_axis, kde=True, ax=ax)
	            st.pyplot(fig)
	            plt.close(fig)
	        elif not has_y:
	            # Previously an empty figure was rendered with no explanation.
	            st.warning(f"{plot_type} needs a Y-axis column. Select one above.")
	        else:
	            fig, ax = plt.subplots()
	            xy_plotters[plot_type](x=x_axis, y=y_axis, data=frame, ax=ax)
	            st.pyplot(fig)
	            plt.close(fig)


	def _train_and_report(frame, target_column):
	    """Fit each candidate regressor on *frame* and report test-set MSE.

	    Every column except *target_column* is used as a feature. Rows whose target
	    is missing are dropped first, because the imputer inside each pipeline only
	    fills gaps in the features.
	    """
	    feature_columns = [col for col in frame.columns if col != target_column]
	    if not feature_columns:
	        st.error("At least one feature column besides the target is required.")
	        return

	    labelled = frame.dropna(subset=[target_column])
	    dropped_rows = len(frame) - len(labelled)
	    if dropped_rows:
	        st.info(f"Ignored {dropped_rows} row(s) with a missing target value.")
	    if len(labelled) < 2:
	        # An 80/20 split needs at least one training and one test row.
	        st.error("Not enough rows with a target value to train and evaluate models.")
	        return

	    X = labelled[feature_columns]
	    y = labelled[target_column]
	    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

	    models = {
	        'Linear Regression': make_pipeline(SimpleImputer(strategy='mean'), LinearRegression()),
	        'Ridge Regression': make_pipeline(SimpleImputer(strategy='mean'), Ridge()),
	        'Lasso Regression': make_pipeline(SimpleImputer(strategy='mean'), Lasso()),
	        'Random Forest': make_pipeline(SimpleImputer(strategy='mean'), RandomForestRegressor(random_state=42)),
	    }

	    mse_scores = {}
	    for name, model in models.items():
	        model.fit(X_train, y_train)
	        predictions = model.predict(X_test)
	        mse_scores[name] = mean_squared_error(y_test, predictions)

	    best_model = min(mse_scores, key=mse_scores.get)
	    st.write(f"Best Model: {best_model} with MSE: {mse_scores[best_model]}")

	    # Display MSE scores for all models
	    for model_name, mse in mse_scores.items():
	        st.write(f"{model_name}: MSE = {mse}")


	if uploaded_file is not None:
	    # Data Preview Section (shows the data as uploaded, before any encoding)
	    st.subheader("Data Preview")
	    preview_rows = st.slider("How many rows to display?", 5, 100, 20)
	    st.dataframe(df.head(preview_rows))

	    # Preprocess the dataset: convert dates to numerical features and encode
	    # categorical variables. Everything below works on the encoded frame.
	    df = _encode_text_columns(df)

	    _render_analysis(df)
	    _render_plots(df)

	    # Model Training and Selection
	    st.subheader("Model Training & Selection")
	    target_column = st.selectbox("Select Target Column", options=df.columns)

	    if st.button("Train Models and Select Best"):
	        _train_and_report(df, target_column)

	# Footer: a horizontal rule, then the author credit, each as its own element
	for footer_markdown in (
	    "---",
	    "Developed by Isuru Lakshan ekanayaka - A versatile data visualization and model training app built with Streamlit.",
	):
	    st.markdown(footer_markdown)