Spaces:

HarnithaS
/

EDA_DescriptiveAnalyzer

Sleeping

Harnitha Suresh

add pre-requirements.txt filenit fix

245b95b over 1 year ago

8.98 kB

	# app.py
	import streamlit as st
	import pandas as pd
	#import seaborn as sns
	#import matplotlib.pyplot as plt
	#import statsmodels.api as sm
	# NOTE(review): this option only silences the warning for calling
	# st.pyplot() without a figure; confirm the pinned Streamlit release still
	# recognises it.
	st.set_option('deprecation.showPyplotGlobalUse', False)

	uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

	st.title("EDA: Descriptive Analyzer")
	# Read the dataset. `intial_df` keeps the data exactly as uploaded, while
	# `df` is an independent working copy that the EDA stages modify. Copying
	# (rather than aliasing) keeps in-place edits of `df` from leaking into the
	# frame that "Reset Data Set" restores from.
	if uploaded_file is not None:
	    try:
	        intial_df = pd.read_csv(uploaded_file)
	    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
	        # Show a readable message instead of a traceback, and stop this
	        # script run so later stages never see an undefined `df`.
	        st.error(f"Could not read the uploaded CSV file: {exc}")
	        st.stop()
	    df = intial_df.copy()



	def _compute_statistics(frame, functions):
	    """Build a table with one row per requested statistic of *frame*.

	    Args:
	        frame: DataFrame to summarise.
	        functions: Names of DataFrame reduction methods (e.g. "mean"), plus
	            the special name "quantile", which expands to the 0.25, 0.5 and
	            0.75 quantiles.

	    Returns:
	        A DataFrame whose first column, "Function", labels each row and whose
	        remaining columns are the columns of *frame* that produced a value.
	    """
	    # These reductions are only meaningful for numbers and fail on text
	    # columns, so they are evaluated on the numeric columns alone.
	    numeric_only = {"sum", "mean", "median", "std", "var", "prod", "skew", "kurt"}
	    quantile_levels = (0.25, 0.5, 0.75)
	    numeric_frame = frame.select_dtypes(include="number")

	    labels = []
	    series = []
	    for name in functions:
	        if name == "quantile":
	            for level in quantile_levels:
	                labels.append(f"Quantile-{level}")
	                series.append(numeric_frame.quantile(level))
	        else:
	            target = numeric_frame if name in numeric_only else frame
	            labels.append(name)
	            series.append(getattr(target, name)())

	    if not series:
	        return pd.DataFrame(columns=["Function"])

	    # One column per statistic, then transpose so each statistic is a row.
	    table = pd.concat(series, axis=1, keys=labels).transpose()
	    # Keep the dataset's own column order regardless of how the union of
	    # the individual results happened to be ordered.
	    ordered = [col for col in frame.columns if col in table.columns]
	    table = table.reindex(columns=ordered)
	    table.insert(0, "Function", labels)
	    return table.reset_index(drop=True)


	def descriptive_analysis():
	    """Let the user pick a statistics group and functions, then show the results.

	    Reads the module-level ``df``. The analysis group is chosen in the
	    sidebar and the functions within it in the page body; nothing is computed
	    until at least one function is selected.
	    """
	    global df
	    # Groups that are planned but not enabled yet: cumulative statistics,
	    # correlation/covariance, histograms, central tendency (mode),
	    # missing-data statistics and categorical statistics.
	    groups = {
	        "Descriptive Statistics": ["count", "sum", "mean", "median", "min", "max", "std", "var", "quantile"],
	        "Aggregation": ["sum", "mean", "median", "std"],
	        "Value Counts": ["nunique"],
	        "Quantiles and Percentiles": ["quantile"],
	        "Miscellaneous Statistics": ["prod", "skew", "kurt"],
	    }
	    selected_group = st.sidebar.selectbox("Select Analysis Type", list(groups.keys()))

	    st.write(f"## {selected_group}")

	    selected_functions = st.multiselect(f"Select functions in {selected_group}", groups[selected_group])

	    if not selected_functions:
	        st.info("Please select at least one function.")
	        return

	    results_df = _compute_statistics(df, selected_functions)

	    st.write("### Results:")
	    st.dataframe(results_df, hide_index = True)

	def data_visualization():
	    """Draw the chart type chosen in the sidebar from user-selected columns.

	    Reads the module-level ``df``. Every chart is built from columns the user
	    picks, so it works on any uploaded dataset rather than only on one with
	    specific column names. Each figure is passed explicitly to ``st.pyplot``.

	    Relies on ``plt`` (matplotlib.pyplot) and ``sns`` (seaborn); their imports
	    are commented out at the top of this file and must be restored before
	    this stage is enabled.
	    """
	    global df
	    visuals = ["Line Plot", "Bar Chart", "Histogram", "Scatter Plot", "Box Plot",
	               "Violin Plot", "Heatmap", "Pair Plot", "Pie Chart"]
	    selected_chart = st.sidebar.selectbox("Select Visualization Type", visuals)
	    st.subheader(selected_chart)

	    all_columns = list(df.columns)
	    numeric_columns = list(df.select_dtypes(include="number").columns)
	    if not numeric_columns:
	        st.warning("The dataset has no numeric columns to plot.")
	        return

	    if selected_chart == "Pair Plot":
	        # pairplot creates and owns its figure, so it is rendered separately.
	        grid = sns.pairplot(df[numeric_columns])
	        st.pyplot(grid.figure)
	        return

	    if selected_chart in ("Line Plot", "Scatter Plot"):
	        x_col = st.selectbox("Select column for x-axis:", all_columns)
	        y_col = st.selectbox("Select column for y-axis:", numeric_columns)
	        fig, ax = plt.subplots()
	        if selected_chart == "Line Plot":
	            ax.plot(df[x_col], df[y_col])
	        else:
	            ax.scatter(df[x_col], df[y_col])
	        ax.set_xlabel(x_col)
	        ax.set_ylabel(y_col)

	    elif selected_chart == "Bar Chart":
	        bar_columns = st.multiselect("Select columns for bar-chart", numeric_columns)
	        if not bar_columns:
	            st.info("Please select at least one column.")
	            return
	        fig, ax = plt.subplots()
	        # One bar per selected column, showing that column's mean.
	        ax.bar([str(col) for col in bar_columns], df[bar_columns].mean())
	        ax.set_ylabel("mean")

	    elif selected_chart == "Histogram":
	        hist_column = st.selectbox("Select column:", numeric_columns)
	        fig, ax = plt.subplots()
	        ax.hist(df[hist_column].dropna(), bins=10)
	        ax.set_xlabel(hist_column)

	    elif selected_chart in ("Box Plot", "Violin Plot"):
	        no_group = "(no grouping)"
	        value_column = st.selectbox("Select value column:", numeric_columns)
	        group_column = st.selectbox("Select grouping column:", [no_group] + all_columns)
	        draw = sns.boxplot if selected_chart == "Box Plot" else sns.violinplot
	        fig, ax = plt.subplots()
	        if group_column == no_group:
	            draw(y=df[value_column], ax=ax)
	        else:
	            draw(x=group_column, y=value_column, data=df, ax=ax)

	    elif selected_chart == "Heatmap":
	        fig, ax = plt.subplots()
	        sns.heatmap(df[numeric_columns].corr(), annot=True, cmap='coolwarm', ax=ax)

	    else:  # "Pie Chart"
	        pie_column = st.selectbox("Select column:", all_columns)
	        counts = df[pie_column].value_counts()
	        fig, ax = plt.subplots()
	        # Slice sizes are the frequencies of each distinct value.
	        ax.pie(counts, labels=counts.index, autopct='%1.1f%%')

	    st.pyplot(fig)
	    # Streamlit reruns the script on every interaction; close the figure so
	    # pyplot does not keep accumulating them.
	    plt.close(fig)

	def collinearity_pairs():
	    """Show a correlation heatmap and list column pairs above a threshold.

	    Reads the module-level ``df``. Correlations are computed on numeric
	    columns only. Each unordered pair is reported once, together with its
	    correlation, when the absolute correlation exceeds the threshold the
	    user enters.

	    Relies on ``sns`` (seaborn); its import is commented out at the top of
	    this file and must be restored before this stage is enabled.
	    """
	    global df
	    from itertools import combinations

	    st.write("### Collinearity")
	    st.sidebar.markdown("[Collinearity](#collinearity)")

	    # Text columns cannot be correlated, so restrict to numeric ones.
	    correlation_matrix = df.select_dtypes(include="number").corr()
	    if correlation_matrix.empty:
	        st.warning("The dataset has no numeric columns to correlate.")
	        return

	    st.subheader("Heatmap")
	    axes = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
	    # Pass the figure explicitly and clear it afterwards so the next rerun
	    # starts from an empty canvas.
	    st.pyplot(axes.figure, clear_figure=True)

	    collinearity_threshold = st.number_input(
	        "Enter collinearity threshold from range [0 1]:",
	        min_value=0.0,
	        max_value=1.0,
	        value=0.7,
	    )

	    # combinations() yields every unordered pair exactly once and never
	    # pairs a column with itself, so no diagonal filter is needed and
	    # perfectly correlated pairs are kept. NaN correlations compare False.
	    pairs = [
	        (first, second, correlation_matrix.loc[first, second])
	        for first, second in combinations(correlation_matrix.columns, 2)
	        if abs(correlation_matrix.loc[first, second]) > collinearity_threshold
	    ]
	    df_col = pd.DataFrame(pairs, columns=['Column1', 'Column2', 'Collinearity'])

	    st.write(f"Number of distinct pairs: {len(df_col)}")
	    st.write("Collinearity Pairs")
	    st.dataframe(df_col)

	def missing_values():
	    """Apply the missing-value strategy selected by the user to ``df``.

	    Rebinds the module-level ``df`` to a new DataFrame instead of editing it
	    in place, so a frame that shares memory with the originally uploaded
	    data is never modified.

	    Strategies: "None" (leave as is), "dropna" (drop rows with any missing
	    value), "Value" (fill with a user-supplied value), "mean" (fill numeric
	    columns with their mean), "Previous Value" / "Next Value" (forward /
	    backward fill) and "interpolate" (interpolate numeric columns).
	    """
	    global df
	    st.write("### Missing Values")
	    st.sidebar.markdown("[Missing Values](#missing-values)")
	    methods = ["None", "dropna", "Value", "mean", "Previous Value", "Next Value", "interpolate"]
	    selected_missing = st.selectbox("Select Missing Values handling method", methods)

	    if selected_missing == "dropna":
	        df = df.dropna()
	    elif selected_missing == "Value":
	        value = st.text_input("Enter value:")
	        # The text box is empty until the user types something; do not fill
	        # missing cells with an empty string in the meantime.
	        if value:
	            # Prefer a numeric fill so numeric columns keep a numeric dtype.
	            fill_value = value
	            for cast in (int, float):
	                try:
	                    fill_value = cast(value)
	                    break
	                except ValueError:
	                    continue
	            df = df.fillna(fill_value)
	    elif selected_missing == "mean":
	        # Means exist only for numeric columns; others are left untouched.
	        df = df.fillna(df.mean(numeric_only=True))
	    elif selected_missing == "Previous Value":
	        df = df.ffill()
	    elif selected_missing == "Next Value":
	        df = df.bfill()
	    elif selected_missing == "interpolate":
	        numeric_columns = df.select_dtypes(include="number").columns
	        if not numeric_columns.empty:
	            df = df.copy()
	            df[numeric_columns] = df[numeric_columns].interpolate()


	def replace_value():
	    """Replace every occurrence of one value in ``df`` with another.

	    The user enters the old and new values as text and chooses how they
	    should be interpreted (text, integer or float). A radio selection is used
	    because it keeps its value across Streamlit reruns, unlike a button,
	    which is only true for the single run in which it was clicked.

	    Rebinds the module-level ``df`` to the replaced frame. Nothing happens
	    until a value to be changed has been entered, and an invalid number is
	    reported to the user instead of raising.
	    """
	    global df
	    st.write("### Replace Value")
	    st.sidebar.markdown("[Replace Value](#replace-value)")
	    prev = st.text_input("Enter value to be changed")
	    change = st.text_input("Enter new value")
	    data_type = st.radio("Data Type:", ["Text", "Int", "Float"])

	    if prev == "":
	        return

	    cast = {"Int": int, "Float": float}.get(data_type)
	    if cast is not None:
	        try:
	            prev = cast(prev)
	            change = cast(change)
	        except ValueError:
	            st.error(f"Both values must be valid {data_type} numbers.")
	            return

	    # replace() returns the new frame; with inplace=True it returns None,
	    # which must never be assigned back to df.
	    df = df.replace(prev, change)


	def display_df():
	    """Render the current module-level ``df`` as an interactive table."""
	    # Only reads the global, so no `global` declaration is required.
	    st.dataframe(df)


	def reset_df():
	    """Offer a button that restores ``df`` to the data as originally uploaded.

	    Rebinds the module-level ``df`` to a copy of ``intial_df``. A copy is
	    used so that later modifications of ``df`` cannot alter ``intial_df``,
	    which would make subsequent resets restore already-modified data.
	    """
	    global df
	    st.write("### Reset Data Set")
	    st.sidebar.markdown("[Reset Data Set](#reset-data-set)")
	    if st.button("Reset Data Set"):
	        df = intial_df.copy()
	        st.write("Data Set reset complete.")


	def main():
	    """Lay out the EDA page: reset, column dropping, dataset view, statistics.

	    Rebinds the module-level ``df`` when the user chooses columns to drop.
	    Each stage also adds a sidebar link pointing at its heading.
	    """
	    global df
	    st.sidebar.title("EDA Stages")
	    reset_df()

	    # Stage: drop columns
	    st.sidebar.markdown("[Drop columns](#drop-columns)")
	    st.write("### Drop columns")
	    columns_to_drop = st.multiselect("Select any columns to be dropped", df.columns)
	    if columns_to_drop:
	        df = df.drop(columns=columns_to_drop)
	        st.write(f"Columns Dropped:{columns_to_drop}")

	    # Stage: dataset preview (shown only on the run where the button is clicked)
	    st.sidebar.markdown("[Dataset](#dataset)")
	    st.write("### Dataset")
	    if st.button("Show Dataset"):
	        display_df()

	    # Stage: descriptive statistics
	    descriptive_analysis()

	    # Stages currently disabled:
	    # replace_value()
	    # missing_values()
	    # collinearity_pairs()
	    # data_visualization()



	# Entry point: build the page only once a CSV file has been uploaded;
	# until then `uploaded_file` is None and only the uploader and title show.
	# NOTE(review): the call below must be indented under the `if` for the
	# script to parse; the indentation appears to have been lost in this copy.

	if uploaded_file is not None:
	main()