isurulkh committed on
Commit
3d16f39
•
1 Parent(s): 22f4f19

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +138 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: upload a CSV, explore it, and compare regression models by MSE.

The script renders, in order: a sidebar uploader, a raw-data preview, optional
analysis tables/plots, optional visualizations, and a model-training section
that fits four regressors and reports their test-set mean squared error.
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from pandas.api.types import is_string_dtype

# Seaborn plotters that take both an X and a Y column; these plot types
# cannot be drawn when the Y-axis selection is 'None'.
_XY_PLOTTERS = {
    "Line Plot": sns.lineplot,
    "Bar Plot": sns.barplot,
    "Scatter Plot": sns.scatterplot,
    "Box Plot": sns.boxplot,
}


def _load_csv(uploaded_file):
    """Parse the uploaded file as CSV; report a failure in the UI and return None."""
    try:
        return pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        return None


def _show_figure(fig):
    """Render a Matplotlib figure, then close it.

    Streamlit re-executes the script on every interaction, so figures that are
    never closed pile up in pyplot's global registry.
    """
    st.pyplot(fig)
    plt.close(fig)


def _preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with string columns replaced by numeric features.

    Each string-typed column is first tried as a date: on success it is replaced
    by ``<col>_year``, ``<col>_month`` and ``<col>_day``. If date parsing fails,
    the column is one-hot encoded instead (first level dropped).
    """
    processed = df.copy()
    # Iterate over a snapshot: the frame's columns change inside the loop.
    for col in list(processed.columns):
        if not is_string_dtype(processed[col]):
            continue
        try:
            parsed = pd.to_datetime(processed[col])
            # Build all parts before touching the frame so a failure here
            # cannot leave a half-converted column behind.
            date_parts = {
                f"{col}_year": parsed.dt.year,
                f"{col}_month": parsed.dt.month,
                f"{col}_day": parsed.dt.day,
            }
        except (ValueError, TypeError, OverflowError, AttributeError):
            # Not a date column: fall back to categorical encoding.
            processed = pd.get_dummies(processed, columns=[col], drop_first=True)
        else:
            for part_name, part_values in date_parts.items():
                processed[part_name] = part_values
            processed = processed.drop(columns=[col])
    return processed


def _render_analysis(df: pd.DataFrame) -> None:
    """Render the analysis tasks the user selects for ``df``."""
    st.subheader("Data Analysis Tasks")
    analysis_options = ["Descriptive Statistics", "Missing Values Analysis", "Correlation Heatmap"]
    selected_analysis = st.multiselect("Select analysis tasks you want to perform:", analysis_options)

    if "Descriptive Statistics" in selected_analysis:
        st.write("### Descriptive Statistics")
        st.write(df.describe())

    if "Missing Values Analysis" in selected_analysis:
        st.write("### Missing Values Analysis")
        missing_values = df.isnull().sum()
        missing_values = missing_values[missing_values > 0]
        if missing_values.empty:
            st.info("No missing values found.")
        else:
            st.write(missing_values)

    if "Correlation Heatmap" in selected_analysis:
        st.write("### Correlation Heatmap")
        numeric_df = df.select_dtypes(include=[np.number])
        if numeric_df.empty:
            # A heatmap of an empty correlation matrix raises inside seaborn.
            st.info("No numeric data available for a correlation heatmap.")
        else:
            fig, ax = plt.subplots(figsize=(10, 7))
            sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
            _show_figure(fig)


def _render_plots(df: pd.DataFrame) -> None:
    """Render the plot types the user selects, using the chosen X/Y columns."""
    st.subheader("Data Visualization")
    plot_types = ["Line Plot", "Bar Plot", "Scatter Plot", "Histogram", "Interactive Plot", "Box Plot", "Pair Plot"]
    selected_plots = st.multiselect("Choose plot types:", plot_types)
    if not selected_plots:
        return

    columns = df.columns.tolist()
    x_axis = st.selectbox("Select the X-axis", options=columns, index=0)
    y_axis_options = ['None'] + columns
    y_choice = st.selectbox("Select the Y-axis", options=y_axis_options, index=0)
    y_axis = None if y_choice == 'None' else y_choice

    for plot_type in selected_plots:
        st.write(f"### {plot_type}")
        if plot_type in _XY_PLOTTERS and y_axis is None:
            # Previously this rendered an empty set of axes with no explanation.
            st.warning(f"{plot_type} needs a Y-axis column; select one above.")
            continue

        if plot_type == "Interactive Plot":
            title = f"{y_axis} vs {x_axis}" if y_axis is not None else f"{x_axis}"
            fig = px.scatter(df, x=x_axis, y=y_axis, title=title)
            st.plotly_chart(fig, use_container_width=True)
        elif plot_type == "Pair Plot":
            grid = sns.pairplot(df)
            _show_figure(grid.figure)
        else:
            fig, ax = plt.subplots()
            if plot_type == "Histogram":
                sns.histplot(data=df, x=x_axis, kde=True, ax=ax)
            else:
                _XY_PLOTTERS[plot_type](x=x_axis, y=y_axis, data=df, ax=ax)
            _show_figure(fig)


def _render_model_training(df: pd.DataFrame) -> None:
    """Fit four regressors on ``df`` and report the one with the lowest test MSE."""
    st.subheader("Model Training & Selection")
    target_column = st.selectbox("Select Target Column", options=df.columns)

    if not st.button("Train Models and Select Best"):
        return
    if target_column is None:
        st.error("The dataset has no columns to use as a target.")
        return

    # The pipelines impute missing feature values only; rows whose target is
    # missing have to be removed before fitting.
    labelled = df.dropna(subset=[target_column])
    dropped_rows = len(df) - len(labelled)
    if dropped_rows:
        st.info(f"Ignored {dropped_rows} row(s) with a missing target value.")

    feature_columns = [col for col in labelled.columns if col != target_column]
    if not feature_columns:
        st.error("At least one feature column besides the target is required.")
        return
    if len(labelled) < 2:
        st.error("At least two rows with a target value are required to train and test.")
        return

    X = labelled[feature_columns]
    y = labelled[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        'Linear Regression': make_pipeline(SimpleImputer(strategy='mean'), LinearRegression()),
        'Ridge Regression': make_pipeline(SimpleImputer(strategy='mean'), Ridge()),
        'Lasso Regression': make_pipeline(SimpleImputer(strategy='mean'), Lasso()),
        'Random Forest': make_pipeline(SimpleImputer(strategy='mean'), RandomForestRegressor(random_state=42)),
    }

    mse_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mse_scores[name] = mean_squared_error(y_test, predictions)

    best_model = min(mse_scores, key=mse_scores.get)
    st.write(f"Best Model: {best_model} with MSE: {mse_scores[best_model]}")

    # Display MSE scores for all models
    for model_name, mse in mse_scores.items():
        st.write(f"{model_name}: MSE = {mse}")


def main() -> None:
    """Lay out the page: upload, preview, analysis, plots, training, footer."""
    # Set the page configuration
    st.set_page_config(page_title="Data Visualizer & Model Trainer", layout="wide", page_icon="📊", initial_sidebar_state='expanded')

    # Title and Introduction
    st.title("📊 Data Visualizer & Model Trainer - Web App")
    st.markdown("""
This app allows you to upload your data, visualize it through various plots, analyze descriptive statistics, and run multiple regression models to find the best one based on MSE.
""")

    # Sidebar for uploading data
    df = None
    with st.sidebar:
        st.header("Upload and Select Data")
        uploaded_file = st.file_uploader("Choose a CSV file", type=['csv'])
        if uploaded_file is not None:
            df = _load_csv(uploaded_file)
            if df is not None:
                st.success("File successfully uploaded!")

    if df is not None:
        # Data Preview Section (shows the raw data, before preprocessing)
        st.subheader("Data Preview")
        preview_rows = st.slider("How many rows to display?", 5, 100, 20)
        st.dataframe(df.head(preview_rows))

        # Convert dates to numerical features and encode categorical variables
        df = _preprocess(df)

        _render_analysis(df)
        _render_plots(df)
        _render_model_training(df)

    # Footer
    st.markdown("---")
    st.markdown("Developed by Isuru Lakshan ekanayaka - A versatile data visualization and model training app built with Streamlit.")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ numpy
3
+ pandas
4
+ matplotlib
5
+ seaborn
6
+ plotly
7
+ scikit-learn