# Data-Vision / app.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import scipy.stats as stats
import requests
import asyncio
import base64
import joblib
import mimetypes
import time
from io import StringIO, BytesIO
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import (RandomForestRegressor, RandomForestClassifier,
                              GradientBoostingRegressor, GradientBoostingClassifier)
from sklearn.svm import SVR, SVC
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.metrics import (mean_squared_error, r2_score, mean_absolute_error,
                             accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, silhouette_score, classification_report)
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from ydata_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report
# Configurations
st.set_page_config(page_title="Executive Insights Pro", layout="wide", page_icon="📈")
# ----Load Image----
@st.cache_data(ttl=3600)
def load_image(image_url):
"""Loads an image from a URL and returns bytes."""
try:
        response = requests.get(image_url, stream=True, timeout=10)
response.raise_for_status()
return response.content
except requests.exceptions.RequestException as e:
st.error(f"Error loading image: {e}")
return None
# ---- Set the page background from a base64-encoded image ----
def set_background():
"""Sets the background image using base64 encoding."""
image_url = "https://wallpapers.com/images/featured/skrwoybjif4j8l2j.jpg" # Corporate Image
image_data = load_image(image_url)
if image_data:
# Convert bytes to base64
image_base64 = base64.b64encode(image_data).decode()
st.markdown(
f"""
<style>
.stApp {{
background-image: url(data:image/jpeg;base64,{image_base64});
background-size: cover;
background-position: center center;
background-attachment: fixed;
}}
</style>
""",
unsafe_allow_html=True,
)
return
# Simplified CSS
def apply_simplified_theme():
"""Injects simplified CSS to enhance Streamlit's default style."""
st.markdown(
"""
<style>
[data-testid="stSidebar"] {
background-color: rgba(52, 73, 94, 0.9);
color: white;
}
.main h1, .main h2, .main h3, .main h4, .main h5, .main h6 {
color: #5396C6;
}
.st-bb, .st-ae, .st-bv {
background-color: rgba(20, 20, 30, 0.3);
box-shadow: 1px 1px 5px #4e4e4e;
}
</style>
""",
unsafe_allow_html=True,
)
return
# Apply background and simplified theme
set_background()
apply_simplified_theme()
def show_loader(message="Loading..."):
    """Displays an animated loader. Defines the .loader CSS class that the markup below relies on."""
    st.markdown(
        f"""
        <style>
        .loader {{ border: 4px solid rgba(0, 247, 255, 0.2); border-top: 4px solid #00f7ff;
                   border-radius: 50%; width: 24px; height: 24px; animation: spin 1s linear infinite; }}
        @keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}
        </style>
        <div style="display: flex; align-items: center; justify-content: center; margin-top: 20px;">
            <div class="loader"></div>
            <span style="margin-left: 10px; color: #00f7ff;">{message}</span>
        </div>
        """,
        unsafe_allow_html=True
    )
@st.cache_data(ttl=3600)  # st.cache_data does not accept allow_output_mutation (that was st.cache)
def load_data(uploaded_file):
"""Load and cache dataset, with file type validation."""
if uploaded_file is not None:
file_extension = uploaded_file.name.split(".")[-1].lower()
mime_type = mimetypes.guess_type(uploaded_file.name)[0]
max_file_size_mb = 50 # Set a maximum file size (adjust as needed)
file_size_mb = uploaded_file.size / (1024 * 1024)
if file_size_mb > max_file_size_mb:
st.error(f"File size exceeds the limit of {max_file_size_mb} MB.")
return None
try: # Wrap file reading in a try...except
if file_extension == "csv" or mime_type == 'text/csv':
df = pd.read_csv(uploaded_file)
return df
elif file_extension in ["xlsx", "xls"] or mime_type in ['application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']:
df = pd.read_excel(uploaded_file)
return df
else:
st.error("Unsupported file type. Please upload a CSV or Excel file.")
return None
except FileNotFoundError:
st.error("File not found. Please check the file path.")
except pd.errors.ParserError: # Catch pandas-specific parsing errors
st.error("Error parsing the file. Make sure it's a valid CSV or Excel file.")
except Exception as e:
st.error(f"An unexpected error occurred: {type(e).__name__} - {str(e)}")
return None # Handle other potential exceptions
else:
return None
@st.cache_data(ttl=3600)
def generate_profile(df):
"""Generate automated EDA report"""
return ProfileReport(df, minimal=True)
# Session State Management
if 'raw_data' not in st.session_state:
st.session_state.raw_data = None
if 'cleaned_data' not in st.session_state:
st.session_state.cleaned_data = None
if 'train_test' not in st.session_state:
st.session_state.train_test = {}
if 'model' not in st.session_state:
st.session_state.model = None
if 'preprocessor' not in st.session_state:
    st.session_state.preprocessor = None  # stores the fitted ColumnTransformer
if 'feature_selector' not in st.session_state:
    st.session_state.feature_selector = None  # stores the fitted SelectKBest, if used
# Sidebar Navigation
st.sidebar.title("🔮 Data Wizard Pro")
# Apply custom CSS to change text color in the sidebar
st.markdown(
"""
<style>
[data-testid="stSidebar"] {
color: #00f7ff; /* Cyan color for sidebar text */
}
</style>
""",
unsafe_allow_html=True,
)
# Navigation options
app_mode = st.sidebar.radio("Navigate", [
"Data Upload",
"Smart Cleaning",
"Advanced EDA",
"Model Training",
"Predictions",
"Visualization Lab",
"Neural Network Studio" # New option
])
# --- Main App Logic ---
if app_mode == "Data Upload":
st.title("📤 Data Upload & Initial Analysis")
# File Upload Section with improved styling
st.markdown(
"""
<style>
.stFileUploader label {
color: #00f7ff !important; /* Cyan color for the label */
}
.stFileUploader div div div {
background-color: #141422 !important; /* Dark background */
color: #e0e0ff !important; /* Light text */
border: 1px solid #00f7ff !important; /* Cyan border */
border-radius: 10px;
}
</style>
""",
unsafe_allow_html=True,
)
uploaded_file = st.file_uploader(
"Choose a CSV or Excel file", type=["csv", "xlsx"],
help="Upload your dataset here. Supported formats: CSV, XLSX"
)
if uploaded_file:
df = load_data(uploaded_file)
if df is not None:
# only proceed if load_data returned a valid dataframe
st.session_state.raw_data = df
st.session_state.cleaned_data = df.copy()
st.subheader("Data Overview")
# Data Overview Cards with more context
col1, col2, col3 = st.columns(3)
with col1:
st.metric("Number of Rows", df.shape[0], help="Total number of entries in the dataset.")
with col2:
st.metric("Number of Columns", df.shape[1], help="Total number of features in the dataset.")
with col3:
num_missing = df.isna().sum().sum()
st.metric("Total Missing Values", num_missing, help="Total number of missing entries across the entire dataset.")
# Display Data Types
st.write("Column Data Types:")
dtype_counts = df.dtypes.value_counts().to_dict()
for dtype, count in dtype_counts.items():
st.write(f"- {dtype}: {count} column(s)")
# Sample Data Table with improved display
st.subheader("Sample Data")
num_rows_preview = st.slider("Number of Rows to Preview", 5, 20, 10, help="Adjust the number of rows displayed in the sample data.")
st.dataframe(df.head(num_rows_preview), use_container_width=True)
# Column Statistics
with st.expander("📊 Column Statistics"):
for col in df.columns:
st.subheader(f"Column: {col}")
st.write(f"Data type: {df[col].dtype}")
if pd.api.types.is_numeric_dtype(df[col]):
st.write("Summary Statistics:")
st.write(df[col].describe())
else:
st.write("Value Counts:")
st.write(df[col].value_counts())
# Automated EDA Report
with st.expander("🚀 Automated Data Report"):
if st.button("Generate Smart Report"):
show_loader("Generating EDA Report")
pr = generate_profile(df)
st_profile_report(pr)
elif app_mode == "Smart Cleaning":
st.title("🧼 Intelligent Data Cleaning")
if st.session_state.raw_data is not None:
df = st.session_state.cleaned_data
# Cleaning Toolkit
col1, col2 = st.columns([1, 3])
with col1:
st.subheader("Cleaning Actions")
clean_action = st.selectbox("Choose Operation", [
"Handle Missing Values",
"Clean Text",
# ... other cleaning operations ...
])
if clean_action == "Handle Missing Values":
columns_with_missing = df.columns[df.isnull().any()].tolist()
column_to_impute = st.selectbox("Column to Impute", ["All Columns"] + columns_with_missing)
method = st.selectbox("Imputation Method", [
"KNN Imputation",
"Median Fill",
"Mean Fill",
"Drop Missing",
"Constant Value Fill"
])
if method == "KNN Imputation":
knn_neighbors = st.slider("KNN Neighbors", 2, 10, 5)
elif method == "Constant Value Fill":
constant_value = st.text_input("Constant Value")
            elif clean_action == "Clean Text":
                text_column = st.selectbox("Text Column", df.select_dtypes(include='object').columns)
                cleaning_operation = st.selectbox("Cleaning Operation", ["Remove Special Characters", "Lowercase", "Uppercase", "Remove Extra Spaces"])
                chars_to_remove = r'[^a-zA-Z0-9\s]'  # default pattern; only used by "Remove Special Characters"
                if cleaning_operation == "Remove Special Characters":
                    chars_to_remove = st.text_input("Regex of Characters to Remove", r'[^a-zA-Z0-9\s]')
with col2:
if st.button("Apply Transformation"):
with st.spinner("Applying changes..."):
current_df = df.copy()
# ... (your data history logic) ...
                    if clean_action == "Handle Missing Values":
                        if method == "KNN Imputation":
                            imputer = KNNImputer(n_neighbors=knn_neighbors)
                            if column_to_impute == "All Columns":
                                # KNNImputer only handles numeric data, so impute the numeric columns in place
                                numeric_cols = current_df.select_dtypes(include=np.number).columns
                                current_df[numeric_cols] = imputer.fit_transform(current_df[numeric_cols])
                            else:
                                current_df[[column_to_impute]] = imputer.fit_transform(current_df[[column_to_impute]])
                        elif method == "Median Fill":
                            if column_to_impute == "All Columns":
                                current_df = current_df.fillna(current_df.median(numeric_only=True))
                            else:
                                current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].median())
                        elif method == "Mean Fill":
                            if column_to_impute == "All Columns":
                                current_df = current_df.fillna(current_df.mean(numeric_only=True))
                            else:
                                current_df[column_to_impute] = current_df[column_to_impute].fillna(current_df[column_to_impute].mean())
                        elif method == "Constant Value Fill":
                            if column_to_impute == "All Columns":
                                current_df = current_df.fillna(constant_value)
                            else:
                                current_df[column_to_impute] = current_df[column_to_impute].fillna(constant_value)
                        else:  # Drop Missing
                            if column_to_impute == "All Columns":
                                current_df = current_df.dropna()
                            else:
                                current_df = current_df.dropna(subset=[column_to_impute])
elif clean_action == "Clean Text":
                        import re  # local import; only the text-cleaning branch needs it
def clean_text(text, operation, chars_to_remove=r'[^a-zA-Z0-9\s]'):
if operation == "Remove Special Characters":
text = re.sub(chars_to_remove, '', str(text))
elif operation == "Lowercase":
text = str(text).lower()
elif operation == "Uppercase":
text = str(text).upper()
elif operation == "Remove Extra Spaces":
text = " ".join(str(text).split())
return text
current_df[text_column] = current_df[text_column].astype(str).apply(lambda x: clean_text(x, cleaning_operation, chars_to_remove))
st.session_state.cleaned_data = current_df
st.success("Transformation applied!")
elif app_mode == "Advanced EDA":
st.title("🔍 Advanced Exploratory Analysis")
if st.session_state.cleaned_data is not None:
df = st.session_state.cleaned_data.copy()
# Initialize session state for plot configuration
if 'plot_config' not in st.session_state:
st.session_state.plot_config = {
'plot_type': "Histogram",
'x_col': df.columns[0] if len(df.columns) > 0 else None,
'y_col': df.columns[1] if len(df.columns) > 1 else None,
'z_col': df.columns[2] if len(df.columns) > 2 else None,
'color_col': None,
'size_col': None,
'time_col': None,
'value_col': None,
'scatter_matrix_cols': df.select_dtypes(include=np.number).columns.tolist()[:5],
'color_palette': "#00f7ff",
'color_continuous_scale': "Viridis",
'hover_data_cols': [],
'filter_col': None,
'filter_options': []
}
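        # plot_config lives in session state so widget choices persist across Streamlit's top-to-bottom reruns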
# Data Filtering Section
with st.expander("🔎 Data Filtering", expanded=False):
# Use direct session state assignment for reactivity
st.session_state.plot_config['filter_col'] = st.selectbox(
"Filter Column",
[None] + list(df.columns),
help="Choose a column to filter the data."
)
if st.session_state.plot_config['filter_col']:
unique_values = df[st.session_state.plot_config['filter_col']].unique()
st.session_state.plot_config['filter_options'] = st.multiselect(
"Filter Values",
unique_values,
default=unique_values,
help=f"Select values from '{st.session_state.plot_config['filter_col']}'"
)
df = df[df[st.session_state.plot_config['filter_col']].isin(
st.session_state.plot_config['filter_options']
)]
# Visualization Configuration
st.sidebar.header("📊 Plot Configuration")
# Plot type selector
st.session_state.plot_config['plot_type'] = st.sidebar.selectbox(
"Choose Visualization",
[
"Histogram", "Scatter Plot", "Box Plot",
"Correlation Heatmap", "3D Scatter",
"Violin Plot", "Time Series", "Scatter Matrix"
],
            index=0
)
# Dynamic controls based on plot type
if st.session_state.plot_config['plot_type'] != "Correlation Heatmap":
st.session_state.plot_config['x_col'] = st.sidebar.selectbox(
"X Axis",
df.columns,
index=df.columns.get_loc(st.session_state.plot_config['x_col'])
if st.session_state.plot_config['x_col'] in df.columns else 0
)
if st.session_state.plot_config['plot_type'] in ["Scatter Plot", "Box Plot",
"Violin Plot", "Time Series",
"3D Scatter", "Histogram"]:
st.session_state.plot_config['y_col'] = st.sidebar.selectbox(
"Y Axis",
df.columns,
index=df.columns.get_loc(st.session_state.plot_config['y_col'])
if st.session_state.plot_config['y_col'] in df.columns else 0
)
if st.session_state.plot_config['plot_type'] == "3D Scatter":
st.session_state.plot_config['z_col'] = st.sidebar.selectbox(
"Z Axis",
df.columns,
index=df.columns.get_loc(st.session_state.plot_config['z_col'])
if st.session_state.plot_config['z_col'] in df.columns else 0
)
st.session_state.plot_config['color_col'] = st.sidebar.selectbox(
"Color by",
[None] + list(df.columns)
)
# Color configuration
if st.session_state.plot_config['plot_type'] == "Correlation Heatmap":
st.session_state.plot_config['color_continuous_scale'] = st.sidebar.selectbox(
"Color Scale",
['Viridis', 'Plasma', 'Magma', 'Cividis', 'RdBu']
)
else:
st.session_state.plot_config['color_palette'] = st.sidebar.selectbox(
"Color Palette",
['#00f7ff', '#ff00ff', '#f70000', '#0000f7']
)
# Additional configurations
if st.session_state.plot_config['plot_type'] == "Scatter Plot":
st.session_state.plot_config['size_col'] = st.sidebar.selectbox(
"Size by",
[None] + list(df.columns)
)
st.session_state.plot_config['hover_data_cols'] = st.sidebar.multiselect(
"Hover Data",
df.columns
)
if st.session_state.plot_config['plot_type'] == "Time Series":
st.session_state.plot_config['time_col'] = st.sidebar.selectbox(
"Time Column",
df.columns
)
st.session_state.plot_config['value_col'] = st.sidebar.selectbox(
"Value Column",
df.columns
)
if st.session_state.plot_config['plot_type'] == "Scatter Matrix":
st.session_state.plot_config['scatter_matrix_cols'] = st.multiselect(
"Columns for Scatter Matrix",
df.select_dtypes(include=np.number).columns,
default=st.session_state.plot_config['scatter_matrix_cols']
)
# Plot generation
try:
fig = None
config = st.session_state.plot_config
if config['plot_type'] == "Histogram":
fig = px.histogram(
df, x=config['x_col'], y=config['y_col'],
nbins=30, template="plotly_dark",
color_discrete_sequence=[config['color_palette']]
)
elif config['plot_type'] == "Scatter Plot":
fig = px.scatter(
df, x=config['x_col'], y=config['y_col'],
color_discrete_sequence=[config['color_palette']],
size=config['size_col'],
hover_data=config['hover_data_cols']
)
elif config['plot_type'] == "3D Scatter":
fig = px.scatter_3d(
df, x=config['x_col'], y=config['y_col'], z=config['z_col'],
color=config['color_col'],
color_discrete_sequence=[config['color_palette']]
)
elif config['plot_type'] == "Correlation Heatmap":
numeric_df = df.select_dtypes(include=np.number)
if not numeric_df.empty:
corr = numeric_df.corr()
fig = px.imshow(
corr, text_auto=True,
color_continuous_scale=config['color_continuous_scale']
)
else:
st.warning("No numerical columns found for correlation heatmap.")
elif config['plot_type'] == "Box Plot":
fig = px.box(
df, x=config['x_col'], y=config['y_col'],
color_discrete_sequence=[config['color_palette']]
)
elif config['plot_type'] == "Violin Plot":
fig = px.violin(
df, x=config['x_col'], y=config['y_col'],
box=True, points="all",
color_discrete_sequence=[config['color_palette']]
)
elif config['plot_type'] == "Time Series":
df = df.sort_values(by=config['time_col'])
fig = px.line(
df, x=config['time_col'], y=config['value_col'],
color_discrete_sequence=[config['color_palette']]
)
elif config['plot_type'] == "Scatter Matrix":
fig = px.scatter_matrix(
df, dimensions=config['scatter_matrix_cols'],
color_discrete_sequence=[config['color_palette']]
)
if fig:
st.plotly_chart(fig, use_container_width=True)
except Exception as e:
st.error(f"An error occurred while generating the plot: {e}")
with st.expander("🧪 Hypothesis Testing"):
test_type = st.selectbox("Select Test Type", ["T-test", "Chi-Squared Test"])
if test_type == "T-test":
col1 = st.selectbox("Column 1 (Numeric)", df.select_dtypes(include=np.number).columns)
col2 = st.selectbox("Column 2 (Categorical)", df.select_dtypes(include='object').columns)
if st.button("Run T-test"):
# Example: Split data by category and perform t-test
try:
groups = df.groupby(col2)[col1].apply(list)
if len(groups) == 2:
                            t_stat, p_value = stats.ttest_ind(groups.iloc[0], groups.iloc[1], nan_policy='omit')
st.write(f"T-statistic: {t_stat:.4f}")
st.write(f"P-value: {p_value:.4f}")
if p_value < 0.05:
st.write("Reject the null hypothesis.")
else:
st.write("Fail to reject the null hypothesis.")
else:
st.write("Select a categorical column with exactly two categories.")
except Exception as e:
st.error(f"An error occurred during the T-test: {e}")
elif app_mode == "Model Training":
st.title("🚂 Model Training")
feature_selection_method = st.selectbox("Feature Selection Method", ["None", "SelectKBest"])
if model_name == "Random Forest":
param_grid = {
'n_estimators': st.slider("Number of Estimators", 10, 200, 100, help="Number of trees in the forest."),
'max_depth': st.slider("Max Depth", 3, 20, 10, help="Maximum depth of the tree."),
'min_samples_split': st.slider("Minimum Samples Split", 2, 10, 2, help="Minimum samples required to split an internal node"), #New hyperparameter
'min_samples_leaf': st.slider("Minimum Samples Leaf", 1, 10, 1, help="Minimum samples required to be at a leaf node"), #New hyperparameter
}
#Inside the train model button
if st.button("Train Model"):
#Feature Selection
if feature_selection_method == "SelectKBest":
k = st.slider("Number of Features to Select", 1, len(feature_columns), len(feature_columns))
selector = SelectKBest(k=k)
X_train_selected = selector.fit_transform(X_train_processed, y_train)
X_test_selected = selector.transform(X_test_processed)
else:
X_train_selected = X_train_processed
X_test_selected = X_test_processed
# Model Training and Hyperparameter Tuning
if model_name == "Linear Regression":
model = LinearRegression()
elif model_name == "Logistic Regression":
model = LogisticRegression(max_iter=1000)
elif model_name == "Decision Tree":
if problem_type == "Regression":
model = DecisionTreeRegressor()
else:
model = DecisionTreeClassifier()
elif model_name == "Random Forest":
if problem_type == "Regression":
model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error') # Example scoring
grid_search.fit(X_train_selected, y_train)
model = grid_search.best_estimator_
st.write("Best Parameters:", grid_search.best_params_)
else:
model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)
model = grid_search.best_estimator_
st.write("Best Parameters:", grid_search.best_params_)
elif model_name == "Gradient Boosting":
model = GradientBoostingRegressor() if problem_type == "Regression" else GradientBoostingClassifier()
elif model_name == "SVM":
model = SVR() if problem_type == "Regression" else SVC()
# Cross-validation
cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5) #example, adjust cv
st.write(f"Cross-validation scores: {cv_scores}")
st.write(f"Mean cross-validation score: {cv_scores.mean():.4f}")
model.fit(X_train_selected, y_train)
# Model Saving
model_filename = st.text_input("Enter Model Filename (without extension)", "trained_model")
if st.button("Save Model"):
try:
joblib.dump(st.session_state.model, f"{model_filename}.joblib")
st.success(f"Model saved as {model_filename}.joblib")
except Exception as e:
st.error(f"Error saving model: {e}")
# Model loading in a different section
model_file = st.file_uploader("Upload Trained Model", type=["joblib"])
if model_file is not None:
try:
st.session_state.model = joblib.load(model_file)
st.success("Model loaded successfully!")
except Exception as e:
st.error(f"Error loading model: {e}")
#Model Evaluation Section
y_pred = model.predict(X_test_selected)
if problem_type == "Regression":
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
st.write(f"Mean Squared Error: {mse:.4f}")
st.write(f"R-squared: {r2:.4f}")
else:
accuracy = accuracy_score(y_test, y_pred)
st.write(f"Accuracy: {accuracy:.4f}")
elif app_mode == "Predictions":
st.title("🔮 Make Predictions")
if st.session_state.model is not None and st.session_state.cleaned_data is not None:
df = st.session_state.cleaned_data.copy()
# Input data for prediction
st.subheader("Enter Data for Prediction")
input_data = {}
model_columns = st.session_state.model.steps[0][1].transformers_[0][2] + st.session_state.model.steps[0][1].transformers_[1][2]
if not set(model_columns).issubset(set(df.drop(columns=[st.session_state.model.steps[-1][0]]).columns)):
st.error("The model was trained on a dataframe that contains different columns than the currently uploaded dataframe. Please upload the correct dataframe.")
st.stop()
for col in model_columns:
if pd.api.types.is_numeric_dtype(df[col]):
input_data[col] = st.number_input(f"Enter {col}", value=df[col].mean())
else:
input_data[col] = st.selectbox(f"Select {col}", df[col].unique())
# Prediction Button
if st.button("Make Prediction"):
try:
input_df = pd.DataFrame([input_data])
prediction = st.session_state.model.predict(input_df)[0]
st.subheader("Prediction Result")
st.write(f"The predicted value is: {prediction}")
# Additional Feedback (Example for Classification)
if isinstance(st.session_state.model.steps[-1][1], LogisticRegression):
probabilities = st.session_state.model.predict_proba(input_df)[0]
st.write("Predicted Probabilities:")
st.write(probabilities)
except Exception as e:
st.error(f"An error occurred during prediction: {e}")
else:
st.write("Please train a model first in the 'Model Training' section.")
#Add batch prediction section in prediction tab
st.subheader("Batch Predictions")
batch_file = st.file_uploader("Upload CSV for Batch Predictions", type=["csv"])
if batch_file is not None:
try:
batch_df = pd.read_csv(batch_file)
# Preprocess the batch data
batch_processed = st.session_state.preprocessor.transform(batch_df)
# Make predictions
batch_predictions = st.session_state.model.predict(batch_processed)
batch_df['Prediction'] = batch_predictions
st.dataframe(batch_df)
# Download predictions
csv = batch_df.to_csv(index=False)
b64 = base64.b64encode(csv.encode()).decode() # some strings
href = f'<a href="data:file/csv;base64,{b64}" download="predictions.csv">Download Predictions CSV</a>'
st.markdown(href, unsafe_allow_html=True)
except Exception as e:
st.error(f"Error processing batch file: {e}")
elif app_mode == "Visualization Lab":
st.title("🔬 Advanced Data Visualization and Clustering Lab")
# Initialize session state for cleaned data
if 'cleaned_data' not in st.session_state:
st.session_state.cleaned_data = None
    # Optional: upload a CSV directly in this section
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
if uploaded_file is not None:
try:
df = pd.read_csv(uploaded_file)
st.session_state.cleaned_data = df
st.success("Data loaded successfully!")
except Exception as e:
st.error(f"Error loading data: {e}")
if st.session_state.cleaned_data is not None:
df = st.session_state.cleaned_data.copy()
# Visualization Type Selection
visualization_type = st.selectbox("Select Visualization Type", [
"Pair Plot", "Parallel Coordinates Plot", "Andrews Curves", "Pie Chart",
"Area Chart", "Density Contour", "Sunburst Chart", "Funnel Chart", "Clustering Analysis"
])
if visualization_type == "Pair Plot":
st.subheader("Pair Plot")
cols_for_pairplot = st.multiselect("Select Columns for Pair Plot", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
if cols_for_pairplot:
fig = px.scatter_matrix(df, dimensions=cols_for_pairplot)
st.plotly_chart(fig, use_container_width=True)
elif visualization_type == "Parallel Coordinates Plot":
st.subheader("Parallel Coordinates Plot")
cols_for_parallel = st.multiselect("Select Columns for Parallel Coordinates", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
if cols_for_parallel:
fig = px.parallel_coordinates(df[cols_for_parallel], color=df[cols_for_parallel[0]] if cols_for_parallel else None)
st.plotly_chart(fig, use_container_width=True)
elif visualization_type == "Andrews Curves":
st.subheader("Andrews Curves")
cols_for_andrews = st.multiselect("Select Columns for Andrews Curves", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:5])
if cols_for_andrews:
fig = px.andrews_curves(df[cols_for_andrews + [df.columns[0]]], class_column=df.columns[0])
st.plotly_chart(fig, use_container_width=True)
elif visualization_type == "Pie Chart":
st.subheader("Pie Chart")
col_for_pie = st.selectbox("Select Column for Pie Chart", df.columns)
fig = px.pie(df, names=col_for_pie)
st.plotly_chart(fig, use_container_width=True)
elif visualization_type == "Area Chart":
st.subheader("Area Chart")
cols_for_area = st.multiselect("Select Columns for Area Chart", df.select_dtypes(include=np.number).columns.tolist(), default=df.select_dtypes(include=np.number).columns.tolist()[:3])
if cols_for_area:
fig = px.area(df[cols_for_area])
st.plotly_chart(fig, use_container_width=True)
elif visualization_type == "Density Contour":
st.subheader("Density Contour")
x_col = st.selectbox("Select X Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
y_col = st.selectbox("Select Y Column for Density Contour", df.select_dtypes(include=np.number).columns.tolist())
fig = px.density_contour(df, x=x_col, y=y_col)
st.plotly_chart(fig, use_container_width=True)
elif visualization_type == "Sunburst Chart":
st.subheader("Sunburst Chart")
path_cols = st.multiselect("Select Path Columns for Sunburst Chart", df.columns)
if path_cols:
fig = px.sunburst(df, path=path_cols)
st.plotly_chart(fig, use_container_width=True)
elif visualization_type == "Funnel Chart":
st.subheader("Funnel Chart")
x_col = st.selectbox("Select X Column for Funnel Chart (Values)", df.select_dtypes(include=np.number).columns.tolist())
y_col = st.selectbox("Select Y Column for Funnel Chart (Categories)", df.columns)
fig = px.funnel(df, x=x_col, y=y_col)
st.plotly_chart(fig, use_container_width=True)
elif visualization_type == "Clustering Analysis":
st.subheader("Clustering Analysis")
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
if not numerical_cols:
st.warning("No numerical columns found for clustering.")
else:
cluster_cols = st.multiselect("Select Columns for Clustering", numerical_cols, default=numerical_cols[:2] if len(numerical_cols) >= 2 else numerical_cols)
if cluster_cols:
try:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[cluster_cols])
n_clusters = st.slider("Number of Clusters", 2, 10, 3, help="Number of clusters to form.")
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(scaled_data)
df['Cluster'] = clusters
if len(cluster_cols) == 2:
fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
st.plotly_chart(fig, use_container_width=True)
elif len(cluster_cols) == 3:
fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
st.plotly_chart(fig, use_container_width=True)
else:
st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
st.success("Clustering applied successfully!")
except Exception as e:
st.error(f"An error occurred during clustering: {e}")
#Add clustering performance in clustering analysis
if len(cluster_cols) >= 2: # Evaluate Silhouette Score
try:
silhouette_avg = silhouette_score(scaled_data, clusters)
st.write(f"Silhouette Score: {silhouette_avg:.4f}")
except:
st.write("Could not compute silhouette score")
#Add dimensionality reduction option and 2d/3d plots
dimension_reduction = st.selectbox("Dimensionality Reduction", ["None", "PCA"])
if dimension_reduction == "PCA":
n_components = st.slider("Number of Components", 2, min(3, len(cluster_cols)), 2)
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i + 1}' for i in range(n_components)])
pca_df['Cluster'] = clusters # Add Cluster
if len(cluster_cols) >= 2: #plotting section
fig = None #Initialize fig
if dimension_reduction == "None":
if len(cluster_cols) == 2:
fig = px.scatter(df, x=cluster_cols[0], y=cluster_cols[1], color='Cluster', title="K-Means Clustering")
st.plotly_chart(fig, use_container_width=True)
elif len(cluster_cols) == 3:
fig = px.scatter_3d(df, x=cluster_cols[0], y=cluster_cols[1], z=cluster_cols[2], color='Cluster', title="K-Means Clustering (3D)")
st.plotly_chart(fig, use_container_width=True)
else:
st.write("Clustering visualization is only supported for 2 or 3 selected columns.")
elif dimension_reduction == "PCA":
if n_components == 2:
fig = px.scatter(pca_df, x='PC1', y='PC2', color='Cluster', title="K-Means Clustering (PCA - 2D)")
st.plotly_chart(fig, use_container_width=True)
elif n_components == 3:
fig = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3', color='Cluster', title="K-Means Clustering (PCA - 3D)")
st.plotly_chart(fig, use_container_width=True)
else:
st.write("PCA visualization is only supported for 2 or 3 components.")
elif app_mode == "Neural Network Studio":
st.title("🧠 Neural Network Studio")
if st.session_state.cleaned_data is not None:
df = st.session_state.cleaned_data.copy()
# Target Variable Selection
target_column = st.selectbox("Select Target Variable", df.columns, help="Choose the column you want to predict.")
# Problem Type Selection
problem_type = st.radio("Select Problem Type", ["Regression", "Classification"], help="Choose the type of machine learning problem.")
# Feature Selection (optional)
use_all_features = st.checkbox("Use All Features", value=True, help="Select to use all features for training. Deselect to manually choose features.")
if use_all_features:
feature_columns = df.drop(columns=[target_column]).columns.tolist()
else:
feature_columns = st.multiselect("Select Feature Columns", df.drop(columns=[target_column]).columns, help="Choose the features you want to use for prediction.")
# Model Selection
model_type = st.selectbox("Select Neural Network Model", [
"Simple Neural Network", "Convolutional Neural Network (CNN)", "Recurrent Neural Network (RNN)"
], help="Choose the neural network model to use.")
# Hyperparameter Tuning
with st.expander("Hyperparameter Tuning", expanded=False):
if model_type == "Simple Neural Network":
hidden_layers = st.slider("Number of Hidden Layers", 1, 5, 2, help="Number of hidden layers in the network.")
neurons_per_layer = st.slider("Neurons per Layer", 10, 200, 50, help="Number of neurons in each hidden layer.")
epochs = st.slider("Epochs", 10, 200, 50, help="Number of epochs for training.")
batch_size = st.slider("Batch Size", 16, 128, 32, help="Batch size for training.")
elif model_type == "Convolutional Neural Network (CNN)":
epochs_cnn = st.slider("Epochs", 10, 200, 50, help="Number of epochs for CNN training.")
batch_size_cnn = st.slider("Batch Size", 16, 128, 32, help="Batch size for CNN training.")
elif model_type == "Recurrent Neural Network (RNN)":
epochs_rnn = st.slider("Epochs", 10, 200, 50, help="Number of epochs for RNN training.")
batch_size_rnn = st.slider("Batch Size", 16, 128, 32, help="Batch size for RNN training.")
sequence_length = st.slider("Sequence Length (for RNN)", 10, 100, 30, help="Length of the input sequences for RNN.")
# Train-Test Split
test_size = st.slider("Test Size", 0.1, 0.5, 0.2, help="Proportion of the data to use for testing.")
# Model Training Button
if st.button("Train Neural Network Model"):
with st.spinner("Training neural network model..."):
try:
                    # Split data
                    X = df[feature_columns]
                    y = df[target_column]
                    if problem_type == "Classification":
                        # sparse_categorical_crossentropy expects integer class labels
                        y = pd.Series(LabelEncoder().fit_transform(y), index=y.index)
                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
# Preprocessing
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
numeric_features = X_train.select_dtypes(include=np.number).columns
categorical_features = X_train.select_dtypes(include='object').columns
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
                    X_train_processed = preprocessor.fit_transform(X_train)
                    X_test_processed = preprocessor.transform(X_test)
                    # Keras expects dense arrays; densify if one-hot encoding produced a sparse matrix
                    if hasattr(X_train_processed, "toarray"):
                        X_train_processed = X_train_processed.toarray()
                        X_test_processed = X_test_processed.toarray()
# Neural Network Model Selection and Training
tf.random.set_seed(42) # for reproducibility
# Callbacks (Early Stopping)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
if model_type == "Simple Neural Network":
model = keras.Sequential()
model.add(layers.Input(shape=(X_train_processed.shape[1],)))
for _ in range(hidden_layers):
model.add(layers.Dense(neurons_per_layer, activation=activation)) # Use the selected activation
model.add(
layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
activation='linear' if problem_type == "Regression" else 'softmax'))
optimizer = keras.optimizers.Adam(learning_rate=learning_rate) # Use the learning rate
model.compile(optimizer=optimizer,
loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
history = model.fit(X_train_processed, y_train, epochs=epochs, batch_size=batch_size,
validation_split=0.2, verbose=0,
callbacks=[early_stopping]) # Added early stopping
y_pred = model.predict(X_test_processed)
if problem_type == "Classification":
y_pred = np.argmax(y_pred, axis=1)
elif model_type == "Convolutional Neural Network (CNN)":
X_train_cnn = np.expand_dims(X_train_processed, axis=2)
X_test_cnn = np.expand_dims(X_test_processed, axis=2)
                        model = keras.Sequential()
                        model.add(layers.Input(shape=(X_train_cnn.shape[1], 1)))
                        model.add(layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
                        model.add(layers.MaxPooling1D(pool_size=pooling_size))
model.add(layers.Flatten())
model.add(layers.Dense(50, activation='relu'))
model.add(
layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
activation='linear' if problem_type == "Regression" else 'softmax'))
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer,
loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
history = model.fit(X_train_cnn, y_train, epochs=epochs_cnn, batch_size=batch_size_cnn,
validation_split=0.2, verbose=0,
callbacks=[early_stopping])
y_pred = model.predict(X_test_cnn)
if problem_type == "Classification":
y_pred = np.argmax(y_pred, axis=1)
elif model_type == "Recurrent Neural Network (RNN)":
                        try:
                            # The flat feature vector is reshaped into (samples, timesteps, features),
                            # which requires the feature count to divide evenly by sequence_length
                            if X_train_processed.shape[1] % sequence_length != 0:
                                st.error("The number of processed features must be divisible by the sequence length to reshape for the RNN.")
                                st.stop()
                            X_train_rnn = np.reshape(X_train_processed, (
                                X_train_processed.shape[0], sequence_length,
                                X_train_processed.shape[1] // sequence_length))
                            X_test_rnn = np.reshape(X_test_processed, (
                                X_test_processed.shape[0], sequence_length, X_test_processed.shape[1] // sequence_length))
                            model = keras.Sequential()
                            model.add(layers.Input(shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])))
                            model.add(layers.SimpleRNN(units, activation='relu', dropout=dropout_rate))
model.add(
layers.Dense(1 if problem_type == "Regression" else len(np.unique(y_train)),
activation='linear' if problem_type == "Regression" else 'softmax'))
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer,
loss='mse' if problem_type == "Regression" else 'sparse_categorical_crossentropy',
metrics=['mae'] if problem_type == "Regression" else ['accuracy'])
history = model.fit(X_train_rnn, y_train, epochs=epochs_rnn, batch_size=batch_size_rnn,
validation_split=0.2, verbose=0,
callbacks=[early_stopping])
y_pred = model.predict(X_test_rnn)
if problem_type == "Classification":
y_pred = np.argmax(y_pred, axis=1)
except Exception as e:
st.error(f"Error during RNN training: {e}")
st.stop() # Stop execution if RNN fails
# Evaluation
if problem_type == "Regression":
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
st.write(f"Mean Squared Error: {mse:.4f}")
st.write(f"Root Mean Squared Error: {rmse:.4f}")
st.write(f"Mean Absolute Error: {mae:.4f}")
st.write(f"R-squared: {r2:.4f}")
else:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
st.write(f"Accuracy: {accuracy:.4f}")
st.write(f"Precision: {precision:.4f}")
st.write(f"Recall: {recall:.4f}")
st.write(f"F1 Score: {f1:.4f}")
st.write("Classification Report:")
st.text(classification_report(y_test, y_pred))
# Visualization
st.subheader("Training History")
fig, ax = plt.subplots() # Use matplotlib directly
ax.plot(history.history['loss'], label='loss')
ax.plot(history.history['val_loss'], label='val_loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
st.pyplot(fig) # Display with st.pyplot
st.success("Model trained successfully!")
except Exception as e:
st.error(f"An error occurred during training: {e}")
    else:
        st.write("Please upload a dataset in the 'Data Upload' section first.")