Upload 2 files
Browse files- app.py +138 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
import plotly.express as px
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
from sklearn.linear_model import LinearRegression, Ridge, Lasso
|
9 |
+
from sklearn.ensemble import RandomForestRegressor
|
10 |
+
from sklearn.metrics import mean_squared_error
|
11 |
+
from sklearn.impute import SimpleImputer
|
12 |
+
from sklearn.pipeline import make_pipeline
|
13 |
+
from pandas.api.types import is_string_dtype
|
14 |
+
|
15 |
+
# Configure the browser tab (title, icon), the wide layout, and the sidebar's
# initial state.
# NOTE(review): the "π" glyph used as page_icon and in the title may be a
# mis-decoded emoji — confirm against the original file.
st.set_page_config(
    initial_sidebar_state="expanded",
    page_icon="π",
    layout="wide",
    page_title="Data Visualizer & Model Trainer",
)

# Page heading followed by a one-paragraph description of what the app does.
st.title("π Data Visualizer & Model Trainer - Web App")
st.markdown(
    "\n"
    "This app allows you to upload your data, visualize it through various plots, "
    "analyze descriptive statistics, and run multiple regression models to find "
    "the best one based on MSE.\n"
)
|
23 |
+
|
24 |
+
# Sidebar for uploading data
with st.sidebar:
    st.header("Upload and Select Data")
    uploaded_file = st.file_uploader("Choose a CSV file", type=['csv'])
    if uploaded_file is not None:
        # The upload is untrusted input: an empty, malformed or non-UTF-8 file
        # makes read_csv raise. Report that to the user and halt this run
        # instead of letting the script crash with a traceback.
        try:
            df = pd.read_csv(uploaded_file)
        except (
            pd.errors.EmptyDataError,
            pd.errors.ParserError,
            UnicodeDecodeError,
        ) as exc:
            st.error(f"Could not read the uploaded file as CSV: {exc}")
            st.stop()
        st.success("File successfully uploaded!")
|
31 |
+
|
32 |
+
def _encode_string_columns(frame):
    """Return a copy of ``frame`` with every string column made numeric.

    Each string column is first tried as a date: when ``pd.to_datetime``
    parses it into a datetime column, it is replaced by ``<col>_year``,
    ``<col>_month`` and ``<col>_day``. Otherwise the column is one-hot
    encoded with ``pd.get_dummies(..., drop_first=True)``.
    """
    encoded = frame.copy()
    # Iterate the original column list: the loop body adds and drops columns.
    for col in frame.columns:
        if not is_string_dtype(encoded[col]):
            continue
        # Keep the try body to the one call that is expected to fail for
        # non-date text, so unrelated errors are not silently swallowed.
        try:
            parsed = pd.to_datetime(encoded[col])
        except (ValueError, TypeError, OverflowError):
            parsed = None
        if parsed is not None and pd.api.types.is_datetime64_any_dtype(parsed):
            encoded[f"{col}_year"] = parsed.dt.year
            encoded[f"{col}_month"] = parsed.dt.month
            encoded[f"{col}_day"] = parsed.dt.day
            encoded = encoded.drop(columns=[col])
        else:
            encoded = pd.get_dummies(encoded, columns=[col], drop_first=True)
    return encoded


def _train_and_score(features, target):
    """Fit every candidate regressor and return ``{model name: test MSE}``.

    The data is split 80/20 with a fixed seed. Each estimator is wrapped in a
    pipeline that mean-imputes missing feature values before fitting.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    estimators = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        'Random Forest': RandomForestRegressor(random_state=42),
    }
    mse_scores = {}
    for name, estimator in estimators.items():
        model = make_pipeline(SimpleImputer(strategy='mean'), estimator)
        model.fit(X_train, y_train)
        mse_scores[name] = mean_squared_error(y_test, model.predict(X_test))
    return mse_scores


if uploaded_file is not None:
    # Data Preview Section (shows the data as uploaded, before preprocessing)
    st.subheader("Data Preview")
    preview_rows = st.slider("How many rows to display?", 5, 100, 20)
    st.dataframe(df.head(preview_rows))

    # Preprocess the dataset: Convert dates to numerical features and encode categorical variables
    df = _encode_string_columns(df)

    # Data Analysis Section
    st.subheader("Data Analysis Tasks")
    analysis_options = ["Descriptive Statistics", "Missing Values Analysis", "Correlation Heatmap"]
    selected_analysis = st.multiselect("Select analysis tasks you want to perform:", analysis_options)

    if "Descriptive Statistics" in selected_analysis:
        st.write("### Descriptive Statistics")
        st.write(df.describe())

    if "Missing Values Analysis" in selected_analysis:
        st.write("### Missing Values Analysis")
        missing_values = df.isnull().sum()
        missing_values = missing_values[missing_values > 0]
        st.write(missing_values)

    if "Correlation Heatmap" in selected_analysis:
        st.write("### Correlation Heatmap")
        numeric_df = df.select_dtypes(include=[np.number])
        if numeric_df.empty:
            # Nothing to correlate; avoid handing seaborn an empty matrix.
            st.info("No numeric data available for a correlation heatmap.")
        else:
            # Draw on an explicit Figure and pass that to Streamlit instead of
            # the pyplot module, then close it so figures do not pile up
            # across reruns.
            heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 7))
            sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=heatmap_ax)
            st.pyplot(heatmap_fig)
            plt.close(heatmap_fig)

    # Data Visualization Section
    st.subheader("Data Visualization")
    plot_types = ["Line Plot", "Bar Plot", "Scatter Plot", "Histogram", "Interactive Plot", "Box Plot", "Pair Plot"]
    selected_plots = st.multiselect("Choose plot types:", plot_types)

    if selected_plots:
        columns = df.columns.tolist()
        x_axis = st.selectbox("Select the X-axis", options=columns, index=0)
        y_axis_options = ['None'] + columns
        y_axis = st.selectbox("Select the Y-axis", options=y_axis_options, index=0)
        has_y = y_axis != 'None'

        # Seaborn plotters that need both an X and a Y column.
        xy_plotters = {
            "Line Plot": sns.lineplot,
            "Bar Plot": sns.barplot,
            "Scatter Plot": sns.scatterplot,
            "Box Plot": sns.boxplot,
        }

        for plot_type in selected_plots:
            st.write(f"### {plot_type}")
            if plot_type == "Interactive Plot":
                # Without a Y column the title would otherwise read "None vs x".
                title = f"{y_axis} vs {x_axis}" if has_y else f"{x_axis}"
                fig = px.scatter(df, x=x_axis, y=y_axis if has_y else None, title=title)
                st.plotly_chart(fig, use_container_width=True)
            elif plot_type == "Pair Plot":
                # pairplot creates its own figure; hand exactly that figure to
                # Streamlit rather than the global pyplot state.
                grid = sns.pairplot(df)
                st.pyplot(grid.figure)
                plt.close(grid.figure)
            elif plot_type == "Histogram":
                fig, ax = plt.subplots()
                sns.histplot(data=df, x=x_axis, kde=True, ax=ax)
                st.pyplot(fig)
                plt.close(fig)
            elif not has_y:
                # Line/Bar/Scatter/Box need a Y column; say so instead of
                # rendering an empty set of axes.
                st.warning(f"{plot_type} needs a Y-axis column; select one above.")
            else:
                fig, ax = plt.subplots()
                xy_plotters[plot_type](x=x_axis, y=y_axis, data=df, ax=ax)
                st.pyplot(fig)
                plt.close(fig)

    # Model Training and Selection
    st.subheader("Model Training & Selection")
    target_column = st.selectbox("Select Target Column", options=df.columns)

    if st.button("Train Models and Select Best"):
        # The pipelines impute missing feature values only; rows with a
        # missing target would make every fit() fail, so drop them first.
        labelled = df.dropna(subset=[target_column])
        dropped_rows = len(df) - len(labelled)
        feature_columns = [col for col in labelled.columns if col != target_column]

        if not feature_columns:
            st.error("At least one feature column besides the target is required.")
        elif len(labelled) < 2:
            st.error("At least two rows with a target value are required to train and test.")
        else:
            if dropped_rows:
                st.info(f"Ignored {dropped_rows} row(s) with a missing target value.")

            mse_scores = _train_and_score(labelled[feature_columns], labelled[target_column])

            best_model = min(mse_scores, key=mse_scores.get)
            st.write(f"Best Model: {best_model} with MSE: {mse_scores[best_model]}")

            # Display MSE scores for all models
            for model_name, mse in mse_scores.items():
                st.write(f"{model_name}: MSE = {mse}")
|
135 |
+
|
136 |
+
# Footer: a horizontal rule followed by the attribution line, each emitted as
# its own markdown element.
for footer_markdown in (
    "---",
    "Developed by Isuru Lakshan ekanayaka - A versatile data visualization and model training app built with Streamlit.",
):
    st.markdown(footer_markdown)
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
numpy
|
3 |
+
pandas
|
4 |
+
matplotlib
|
5 |
+
seaborn
|
6 |
+
plotly
|
7 |
+
scikit-learn
|