# app.py
"""Streamlit app for exploratory data analysis (EDA) of an uploaded CSV file.

The script is executed top to bottom by Streamlit on every user interaction.
The uploaded file is parsed into ``intial_df`` (the pristine copy) and ``df``
(the working copy that the individual stages read and modify).
"""
import itertools

import pandas as pd
import streamlit as st

# NOTE(review): the plotting imports below are commented out in this file.
# data_visualization() and collinearity_pairs() reference `plt` / `sns` and
# will raise NameError until they are restored; both stages are currently
# disabled in main().
#import seaborn as sns
#import matplotlib.pyplot as plt
#import statsmodels.api as sm

# The former `st.set_option('deprecation.showPyplotGlobalUse', False)` call is
# gone on purpose: every plot below passes an explicit figure to st.pyplot(),
# so there is no global-figure warning left to silence.

# Statistic groups offered in the sidebar; each value lists DataFrame method
# names that are looked up with getattr().
ANALYSIS_GROUPS = {
    "Descriptive Statistics": [
        "count", "sum", "mean", "median", "min", "max", "std", "var", "quantile",
    ],
    "Aggregation": ["sum", "mean", "median", "std"],
    "Value Counts": ["nunique"],
    "Quantiles and Percentiles": ["quantile"],
    "Miscellaneous Statistics": ["prod", "skew", "kurt"],
    # Groups not wired up yet:
    # "Cumulative Statistics": ["cumsum", "cumprod", "cummax", "cummin"],
    # "Correlation and Covariance": ["corr", "cov"],
    # "Histograms": ["hist"],
    # "Central Tendency": ["mode"],
    # "Missing Data Statistics": ["isna", "notna", "dropna"],
    # "Categorical Statistics": ["describe", "count_categorical"],
}

# Quantile levels reported whenever "quantile" is selected.
QUANTILE_LEVELS = (0.25, 0.5, 0.75)

# Statistics that are meaningful for every dtype; all others are computed on
# numeric columns only, because reductions such as mean() raise TypeError on
# text columns in current pandas.
_ALL_DTYPE_FUNCTIONS = frozenset({"count", "nunique"})

# Chart types that cannot be drawn without at least one numeric column.
_NUMERIC_CHARTS = frozenset(
    {"Bar Chart", "Histogram", "Box Plot", "Violin Plot", "Heatmap", "Pair Plot"}
)

uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
st.title("EDA: Descriptive Analyzer")

# Always define both frames so the stage functions never hit a NameError.
intial_df = pd.DataFrame()
df = pd.DataFrame()

# Read the dataset
if uploaded_file is not None:
    try:
        intial_df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file: {exc}")
        st.stop()
    # Work on a copy so edits made by the stages never touch the pristine frame.
    df = intial_df.copy()


def _compute_statistics(frame: pd.DataFrame, functions: list) -> pd.DataFrame:
    """Return one row per requested statistic and one column per data column.

    Args:
        frame: Data to summarise.
        functions: Names of DataFrame reduction methods; ``"quantile"`` expands
            to one row per level in ``QUANTILE_LEVELS``.

    Returns:
        A table whose first column, ``"Function"``, names the statistic.
    """
    numeric = frame.select_dtypes(include="number")
    labels = []
    rows = []
    for name in functions:
        source = frame if name in _ALL_DTYPE_FUNCTIONS else numeric
        if name == "quantile":
            for level in QUANTILE_LEVELS:
                rows.append(source.quantile(level))
                labels.append(f"Quantile-{level}")
        else:
            rows.append(getattr(source, name)())
            labels.append(name)

    # Each Series becomes a column; transposing turns statistics into rows.
    table = pd.concat(rows, axis=1, keys=labels).transpose()
    table["Function"] = labels
    ordered = ["Function"] + [col for col in table.columns if col != "Function"]
    return table[ordered]


def descriptive_analysis() -> None:
    """Let the user pick a statistic group and functions, then show the results."""
    selected_group = st.sidebar.selectbox(
        "Select Analysis Type", list(ANALYSIS_GROUPS.keys())
    )
    st.write(f"## {selected_group}")

    selected_functions = st.multiselect(
        f"Select functions in {selected_group}", ANALYSIS_GROUPS[selected_group]
    )
    if not selected_functions:
        st.info("Please select at least one function.")
        return

    results_df = _compute_statistics(df, selected_functions)
    st.write("### Results:")
    st.dataframe(results_df, hide_index=True)


def data_visualization() -> None:
    """Draw the chart type chosen in the sidebar from user-selected columns.

    Requires `plt` (matplotlib.pyplot) and `sns` (seaborn) to be imported at
    module level; see the note next to the commented-out imports.
    """
    visuals = [
        "Line Plot", "Bar Chart", "Histogram", "Scatter Plot", "Box Plot",
        "Violin Plot", "Heatmap", "Pair Plot", "Pie Chart",
    ]
    selected_chart = st.sidebar.selectbox("Select Visualization Type", list(visuals))
    st.subheader(selected_chart)

    if df.empty:
        st.info("The dataset is empty; nothing to plot.")
        return

    numeric_cols = list(df.select_dtypes(include="number").columns)
    if selected_chart in _NUMERIC_CHARTS and not numeric_cols:
        st.warning("The dataset has no numeric columns to plot.")
        return

    fig, ax = plt.subplots()

    if selected_chart == "Line Plot":
        x_col = st.selectbox("Select column for x-axis:", df.columns)
        y_col = st.selectbox("Select column for y-axis:", df.columns)
        ax.plot(df[x_col], df[y_col])
        ax.set_xlabel(str(x_col))
        ax.set_ylabel(str(y_col))
    elif selected_chart == "Bar Chart":
        cols = st.multiselect("Select columns for bar-chart", numeric_cols)
        if not cols:
            st.info("Please select at least one column.")
            plt.close(fig)
            return
        # NOTE(review): one bar per column showing its mean; the original
        # passed placeholder heights, so confirm this is the wanted aggregate.
        means = df[cols].mean()
        ax.bar([str(col) for col in cols], means.to_numpy())
        ax.set_ylabel("mean")
    elif selected_chart == "Histogram":
        col = st.selectbox("Select column for histogram:", numeric_cols)
        ax.hist(df[col].dropna(), bins=10)
        ax.set_xlabel(str(col))
    elif selected_chart == "Scatter Plot":
        x_col = st.selectbox("Select column for x-axis:", df.columns)
        y_col = st.selectbox("Select column for y-axis:", df.columns)
        ax.scatter(df[x_col], df[y_col])
        ax.set_xlabel(str(x_col))
        ax.set_ylabel(str(y_col))
    elif selected_chart in ("Box Plot", "Violin Plot"):
        category_col = st.selectbox("Select category column:", df.columns)
        value_col = st.selectbox("Select value column:", numeric_cols)
        draw = sns.boxplot if selected_chart == "Box Plot" else sns.violinplot
        draw(x=category_col, y=value_col, data=df, ax=ax)
    elif selected_chart == "Heatmap":
        sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm', ax=ax)
    elif selected_chart == "Pair Plot":
        # pairplot builds its own figure, so discard the one created above.
        plt.close(fig)
        fig = sns.pairplot(df[numeric_cols]).figure
    elif selected_chart == "Pie Chart":
        col = st.selectbox("Select column for pie chart:", df.columns)
        counts = df[col].value_counts()
        ax.pie(counts.to_numpy(), labels=counts.index.astype(str), autopct='%1.1f%%')

    st.pyplot(fig)
    # Streamlit reruns the script on every interaction; close the figure so
    # matplotlib does not accumulate one per rerun.
    plt.close(fig)


def _high_collinearity_pairs(frame: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """Return each unordered column pair whose |correlation| exceeds *threshold*.

    Every pair appears once (A-B, never also B-A) together with its
    correlation coefficient. Pairs with an undefined (NaN) correlation are
    skipped.
    """
    corr = frame.corr(numeric_only=True)
    records = [
        (first, second, corr.loc[first, second])
        for first, second in itertools.combinations(corr.columns, 2)
        if abs(corr.loc[first, second]) > threshold
    ]
    return pd.DataFrame(records, columns=['Column1', 'Column2', 'Collinearity'])


def collinearity_pairs() -> None:
    """Show a correlation heatmap and list column pairs above a chosen threshold.

    Requires `plt` and `sns` to be imported at module level; see the note
    next to the commented-out imports.
    """
    st.write("### Collinearity")
    st.sidebar.markdown("[Collinearity](#collinearity)")

    st.subheader("Heatmap")
    fig, ax = plt.subplots()
    sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm', ax=ax)
    st.pyplot(fig)
    plt.close(fig)

    collinearity_threshold = st.number_input(
        "Enter collinearity threshold from range [0 1]:",
        min_value=0.0,
        max_value=1.0,
        value=0.7,
        step=0.05,
    )

    pairs = _high_collinearity_pairs(df, collinearity_threshold)
    st.write(f"Number of distinct pairs: {len(pairs)}")
    st.write("Collinearity Pairs")
    st.dataframe(pairs)


def _parse_scalar(text: str):
    """Return *text* as int or float when it parses as one, else unchanged."""
    for caster in (int, float):
        try:
            return caster(text)
        except ValueError:
            continue
    return text


def missing_values() -> None:
    """Apply the missing-value strategy chosen by the user to the working frame."""
    global df
    st.write("### Missing Values")
    st.sidebar.markdown("[Missing Values](#missing-values)")
    methods = ["None", "dropna", "Value", "mean", "Previous Value", "Next Value", "interpolate"]
    selected_missing = st.selectbox("Select Missing Values handling method", methods)

    if selected_missing == "dropna":
        df = df.dropna()
    elif selected_missing == "Value":
        value = st.text_input("Enter value:")
        # The text box is empty until the user types; do not fill with "".
        if value != "":
            # Keep numeric columns numeric when the user enters a number.
            df = df.fillna(_parse_scalar(value))
    elif selected_missing == "mean":
        # Only numeric columns have a mean; other columns are left untouched.
        df = df.fillna(df.mean(numeric_only=True))
    elif selected_missing == "Previous Value":
        df = df.ffill()
    elif selected_missing == "Next Value":
        df = df.bfill()
    elif selected_missing == "interpolate":
        numeric_cols = df.select_dtypes(include="number").columns
        df = df.copy()
        df[numeric_cols] = df[numeric_cols].interpolate()


def replace_value() -> None:
    """Replace every occurrence of one value in the working frame with another.

    Without a button press both inputs are treated as text. Pressing "Int" or
    "Float" converts both inputs to that type for the current run.
    """
    global df
    st.write("### Replace Value")
    st.sidebar.markdown("[Replace Value](#replace-value)")
    prev = st.text_input("Enter value to be changed")
    change = st.text_input("Enter new value")
    st.text("Data Type:")
    intD = st.button("Int")
    floatD = st.button("Float")

    if prev == "":
        # Nothing entered yet.
        return

    caster = int if intD else float if floatD else None
    if caster is not None:
        try:
            prev = caster(prev)
            change = caster(change)
        except ValueError:
            st.error("Both values must be valid numbers for the selected data type.")
            return

    # replace(..., inplace=True) returns None, so rebind to the returned frame.
    df = df.replace(prev, change)


def display_df() -> None:
    """Render the working frame."""
    st.dataframe(df)


def reset_df() -> None:
    """Offer a button that restores the working frame from the pristine copy."""
    global df
    st.write("### Reset Data Set")
    st.sidebar.markdown("[Reset Data Set](#reset-data-set)")
    if st.button("Reset Data Set"):
        st.write("Data Set reset complete.")
        df = intial_df.copy()


def main() -> None:
    """Lay out the EDA stages: reset, drop columns, dataset preview, statistics."""
    global df
    st.sidebar.title("EDA Stages")
    reset_df()

    # drop columns
    st.sidebar.markdown("[Drop columns](#drop-columns)")
    st.write("### Drop columns")
    selected_cols = st.multiselect("Select any columns to be dropped", df.columns)
    if selected_cols:
        df = df.drop(columns=selected_cols)
        st.write(f"Columns Dropped:{selected_cols}")

    st.sidebar.markdown("[Dataset](#dataset)")
    st.write("### Dataset")
    if st.button("Show Dataset"):
        display_df()

    descriptive_analysis()
    # Stages below are disabled; the last two need the plotting imports.
    # replace_value()
    # missing_values()
    # collinearity_pairs()
    # data_visualization()


# File upload
if uploaded_file is not None:
    main()