Spaces:
Sleeping
Sleeping
| # app.py | |
| import streamlit as st | |
| import pandas as pd | |
| #import seaborn as sns | |
| #import matplotlib.pyplot as plt | |
| #import statsmodels.api as sm | |
# Disable Streamlit's 'deprecation.showPyplotGlobalUse' option.
# NOTE(review): presumably this silences the notice shown when st.pyplot() is
# called without a figure; some Streamlit releases may not recognise the
# option any more - confirm against the version this app is deployed with.
st.set_option('deprecation.showPyplotGlobalUse', False)

# The uploader is created before the title, so it is rendered above it.
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
st.title("EDA: Descriptive Analyzer")

# Read the dataset.
# `intial_df` keeps the data exactly as uploaded; `df` is the working dataset
# that the EDA stages read and modify. `df` must be an independent copy:
# binding both names to the same object would let any in-place change to `df`
# silently alter `intial_df` as well, leaving nothing to reset to.
if uploaded_file is not None:
    intial_df = pd.read_csv(uploaded_file)
    df = intial_df.copy()
def descriptive_analysis():
    """Render the descriptive-statistics stage for the working dataset.

    The user picks an analysis group in the sidebar and one or more pandas
    reductions from that group. Each selected reduction is evaluated on the
    module-level ``df`` and shown as one row of a results table, with one
    column per dataset column. Selecting ``quantile`` produces three rows
    (the 0.25, 0.5 and 0.75 quantiles).

    ``count`` and ``nunique`` are evaluated on every column. All other
    reductions are evaluated on the numeric columns only, because calling
    them on a frame that also holds text columns raises ``TypeError`` in
    pandas 2.x; columns without a value for a given row show as missing.
    """
    # Groups not offered yet (previously sketched here): cumulative
    # statistics, correlation/covariance, histograms, mode, missing-data
    # statistics and categorical statistics.
    groups = {
        "Descriptive Statistics": ["count", "sum", "mean", "median", "min", "max", "std", "var", "quantile"],
        "Aggregation": ["sum", "mean", "median", "std"],
        "Value Counts": ["nunique"],
        "Quantiles and Percentiles": ["quantile"],
        "Miscellaneous Statistics": ["prod", "skew", "kurt"],
    }
    # Reductions that are well defined for any column dtype.
    any_dtype_functions = {"count", "nunique"}
    quantile_levels = (0.25, 0.5, 0.75)

    selected_group = st.sidebar.selectbox("Select Analysis Type", list(groups.keys()))
    st.write(f"## {selected_group}")

    selected_functions = st.multiselect(f"Select functions in {selected_group}", groups[selected_group])
    if not selected_functions:
        st.info("Please select at least one function.")
        return

    numeric_df = df.select_dtypes(include="number")

    # One Series (indexed by dataset column) per output row, plus its label.
    labels = []
    per_function = []
    for function in selected_functions:
        if function == "quantile":
            for level in quantile_levels:
                labels.append(f"Quantile-{level}")
                per_function.append(numeric_df.quantile(level))
        else:
            frame = df if function in any_dtype_functions else numeric_df
            labels.append(function)
            per_function.append(getattr(frame, function)())

    # Explicit keys keep the concatenated columns unambiguous even though
    # most of the reduction results are unnamed Series.
    stats = pd.concat(per_function, axis=1, keys=labels)
    # Present the dataset columns in their original order.
    stats = stats.loc[[col for col in df.columns if col in stats.index]]

    results_df = stats.transpose()
    results_df['Function'] = labels
    results_df = results_df[['Function'] + [col for col in results_df.columns if col != 'Function']]

    st.write("### Results:")
    st.dataframe(results_df, hide_index=True)
def data_visualization():
    """Render one chart of the working dataset, chosen in the sidebar.

    Every chart is built from columns of the module-level ``df`` that the
    user selects, drawn on its own matplotlib figure, and handed explicitly
    to ``st.pyplot`` (the figure is closed afterwards so figures do not pile
    up across reruns).

    Chart types:
      * Line Plot / Scatter Plot - one column against another.
      * Bar Chart - mean of each selected numeric column.
      * Histogram - distribution of one numeric column (10 bins).
      * Box Plot / Violin Plot - a numeric column grouped by another column.
      * Heatmap - correlation matrix of the numeric columns.
      * Pair Plot - pairwise relationships of the numeric columns.
      * Pie Chart - share of each distinct value in one column.

    NOTE(review): this function needs the module-level names ``plt``
    (matplotlib.pyplot) and ``sns`` (seaborn). Both imports are commented
    out at the top of the file and must be re-enabled before this stage is
    switched back on in ``main``.
    """
    visuals = ["Line Plot", "Bar Chart", "Histogram", "Scatter Plot", "Box Plot", "Violin Plot", "Heatmap", "Pair Plot", "Pie Chart"]
    selected_chart = st.sidebar.selectbox("Select Visualization Type", visuals)
    st.subheader(selected_chart)

    all_cols = list(df.columns)
    numeric_cols = list(df.select_dtypes(include="number").columns)
    if not all_cols:
        st.info("The dataset has no columns to plot.")
        return

    # Every chart except these three is computed from numeric columns.
    needs_numeric = selected_chart not in ("Line Plot", "Scatter Plot", "Pie Chart")
    if needs_numeric and not numeric_cols:
        st.info("This chart needs at least one numeric column.")
        return

    if selected_chart in ("Line Plot", "Scatter Plot"):
        x_col = st.selectbox("Select column for x-axis:", all_cols)
        y_col = st.selectbox("Select column for y-axis:", all_cols)
        fig, ax = plt.subplots()
        if selected_chart == "Line Plot":
            ax.plot(df[x_col], df[y_col])
        else:
            ax.scatter(df[x_col], df[y_col])
        ax.set_xlabel(str(x_col))
        ax.set_ylabel(str(y_col))
    elif selected_chart == "Bar Chart":
        bar_cols = st.multiselect("Select columns for bar-chart", numeric_cols)
        if not bar_cols:
            st.info("Please select at least one column.")
            return
        fig, ax = plt.subplots()
        ax.bar([str(col) for col in bar_cols], df[bar_cols].mean())
        ax.set_ylabel("Mean")
    elif selected_chart == "Histogram":
        hist_col = st.selectbox("Select column for histogram:", numeric_cols)
        fig, ax = plt.subplots()
        ax.hist(df[hist_col].dropna(), bins=10)
        ax.set_xlabel(str(hist_col))
    elif selected_chart in ("Box Plot", "Violin Plot"):
        category_col = st.selectbox("Select category column:", all_cols)
        value_col = st.selectbox("Select value column:", numeric_cols)
        draw = sns.boxplot if selected_chart == "Box Plot" else sns.violinplot
        fig, ax = plt.subplots()
        draw(x=category_col, y=value_col, data=df, ax=ax)
    elif selected_chart == "Heatmap":
        fig, ax = plt.subplots()
        sns.heatmap(df[numeric_cols].corr(), annot=True, cmap='coolwarm', ax=ax)
    elif selected_chart == "Pair Plot":
        # pairplot builds and owns its own figure.
        fig = sns.pairplot(df[numeric_cols]).figure
    else:  # "Pie Chart"
        pie_col = st.selectbox("Select column for pie chart:", all_cols)
        counts = df[pie_col].value_counts()
        fig, ax = plt.subplots()
        ax.pie(counts, labels=[str(value) for value in counts.index], autopct='%1.1f%%')

    st.pyplot(fig)
    plt.close(fig)
def collinearity_pairs():
    """Render the collinearity stage: correlation heatmap and collinear pairs.

    Computes the correlation matrix of the numeric columns of the
    module-level ``df``, draws it as a heatmap, and lists every unordered
    pair of distinct columns whose absolute correlation is strictly above a
    user-supplied threshold, together with the coefficient.

    Only the upper triangle of the matrix is scanned, so each pair is
    reported once, a column is never paired with itself, and perfectly
    correlated pairs (coefficient exactly 1 or -1) are kept. Pairs whose
    coefficient is NaN are skipped.

    NOTE(review): the heatmap needs the module-level name ``sns`` (seaborn),
    whose import is commented out at the top of the file.
    """
    st.write("### Collinearity")
    st.sidebar.markdown("[Collinearity](#collinearity)")

    # Correlation is only defined for numeric columns; text columns would
    # make DataFrame.corr raise on pandas 2.x.
    correlation_matrix = df.select_dtypes(include="number").corr()
    columns = list(correlation_matrix.columns)
    if len(columns) < 2:
        st.info("At least two numeric columns are needed to measure collinearity.")
        return

    st.subheader("Heatmap")
    axes = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
    # Pass the figure explicitly and clear it afterwards so the next rerun
    # does not draw on top of this heatmap.
    st.pyplot(axes.figure, clear_figure=True)

    collinearity_threshold = st.number_input(
        "Enter collinearity threshold from range [0 1]:",
        min_value=0.0,
        max_value=1.0,
        value=0.7,
    )

    pairs = []
    for i, col1 in enumerate(columns):
        for j in range(i + 1, len(columns)):
            coefficient = correlation_matrix.iloc[i, j]
            # A NaN coefficient compares False and is therefore skipped.
            if abs(coefficient) > collinearity_threshold:
                pairs.append([col1, columns[j], coefficient])

    df_col = pd.DataFrame(pairs, columns=['Column1', 'Column2', 'Collinearity'])
    st.write(f"Number of distinct pairs: {len(df_col)}")
    st.write("Collinearity Pairs")
    st.dataframe(df_col)
def missing_values():
    """Render the missing-values stage and apply the chosen strategy to ``df``.

    Strategies:
      * None - leave the dataset unchanged.
      * dropna - drop every row that contains a missing value.
      * Value - fill with a user-entered value; nothing is filled until the
        user has typed something. Input that parses as a number is used as a
        number, anything else as text.
      * mean - fill numeric columns with their column mean.
      * Previous Value / Next Value - forward / backward fill.
      * interpolate - interpolate the numeric columns.

    The module-level ``df`` is rebound to a new frame rather than modified
    in place, so any other reference to the previous frame is unaffected.
    """
    global df
    st.write("### Missing Values")
    st.sidebar.markdown("[Missing Values](#missing-values)")
    methods = ["None", "dropna", "Value", "mean", "Previous Value", "Next Value", "interpolate"]
    selected_missing = st.selectbox("Select Missing Values handling method", methods)

    if selected_missing == "dropna":
        df = df.dropna()
    elif selected_missing == "Value":
        value = st.text_input("Enter value:")
        # An empty text box means "nothing entered yet", not "fill with ''".
        if value:
            fill_value = value
            for cast in (int, float):
                try:
                    fill_value = cast(value)
                    break
                except ValueError:
                    continue
            df = df.fillna(fill_value)
    elif selected_missing == "mean":
        # numeric_only keeps text columns from breaking the mean.
        df = df.fillna(df.mean(numeric_only=True))
    elif selected_missing == "Previous Value":
        df = df.ffill()
    elif selected_missing == "Next Value":
        df = df.bfill()
    elif selected_missing == "interpolate":
        numeric_cols = df.select_dtypes(include="number").columns
        if len(numeric_cols) > 0:
            df = df.copy()
            df[numeric_cols] = df[numeric_cols].interpolate()
def replace_value():
    """Render the replace-value stage and apply the replacement to ``df``.

    The user enters the value to look for and its replacement. By default
    both are treated as text; pressing "Int" or "Float" converts both
    values to that type for the run triggered by the click. Nothing is
    replaced while the "value to be changed" box is empty, and an error is
    shown if a value cannot be converted to the requested type.

    The module-level ``df`` is rebound to the frame returned by
    ``DataFrame.replace``. (With ``inplace=True`` that method returns
    ``None``, so its result must not be assigned back.)
    """
    global df
    st.write("### Replace Value")
    st.sidebar.markdown("[Replace Value](#replace-value)")
    prev = st.text_input("Enter value to be changed")
    change = st.text_input("Enter new value")
    st.text("Data Type:")
    intD = st.button("Int")
    floatD = st.button("Float")

    if not prev:
        return

    if intD:
        cast = int
    elif floatD:
        cast = float
    else:
        cast = None

    if cast is not None:
        try:
            prev, change = cast(prev), cast(change)
        except ValueError:
            st.error(f"Both values must be valid {cast.__name__} numbers.")
            return

    df = df.replace(prev, change)
def display_df():
    """Show the module-level working dataset ``df`` via ``st.dataframe``."""
    st.dataframe(df)
def reset_df():
    """Render the reset stage; restore ``df`` from ``intial_df`` on request.

    When the "Reset Data Set" button is pressed, the module-level working
    dataset ``df`` is rebound to a copy of ``intial_df`` (the data as
    uploaded). A copy is used so that later in-place changes to ``df``
    cannot alter ``intial_df``, which would make further resets ineffective.
    """
    global df
    st.write("### Reset Data Set")
    st.sidebar.markdown("[Reset Data Set](#reset-data-set)")
    if st.button("Reset Data Set"):
        st.write("Data Set reset complete.")
        df = intial_df.copy()
def main():
    """Lay out the page: reset, column dropping, dataset view, statistics.

    Runs the stages top to bottom and adds a sidebar link for each section.
    Dropping columns rebinds the module-level ``df``; the stages that follow
    see the reduced dataset.
    """
    global df

    st.sidebar.title("EDA Stages")
    reset_df()

    # --- Drop columns ---
    st.sidebar.markdown("[Drop columns](#drop-columns)")
    st.write("### Drop columns")
    columns_to_drop = st.multiselect("Select any columns to be dropped", df.columns)
    if columns_to_drop:
        df = df.drop(columns=columns_to_drop)
        st.write(f"Columns Dropped:{columns_to_drop}")

    # --- Dataset ---
    st.sidebar.markdown("[Dataset](#dataset)")
    st.write("### Dataset")
    if st.button("Show Dataset"):
        display_df()

    # --- Statistics ---
    descriptive_analysis()

    # Stages currently switched off: replace_value(), missing_values(),
    # collinearity_pairs(), data_visualization().
# Entry point: build the page only once a CSV file has been uploaded. The
# module-level `df` and `intial_df` that main() and its stages read are only
# assigned when a file is present.
if uploaded_file is not None:
    main()