|
|
|
import streamlit as st |
|
import pandas as pd |
|
|
|
|
|
|
|
# Suppress Streamlit's warning about calling st.pyplot() without a figure.
# NOTE(review): recent Streamlit releases may no longer accept this option --
# confirm against the version this app is pinned to.
st.set_option('deprecation.showPyplotGlobalUse', False)

uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

st.title("EDA: Descriptive Analyzer")

if uploaded_file is not None:
    # `intial_df` keeps the data exactly as uploaded; `df` is the working copy
    # that the EDA stages modify. Copying (instead of binding both names to
    # the same object) guarantees that in-place edits of `df` can never alter
    # the frame used to reset the dataset.
    intial_df = pd.read_csv(uploaded_file)
    df = intial_df.copy()
|
|
|
|
|
|
|
# Reductions that are only defined for numeric data. They are invoked with
# numeric_only=True so text/date columns are skipped instead of raising.
_NUMERIC_ONLY_STATS = frozenset(
    {"sum", "mean", "median", "std", "var", "prod", "skew", "kurt"}
)

# Quantile levels reported when the "quantile" function is selected.
_QUANTILE_LEVELS = (0.25, 0.5, 0.75)


def _build_statistics_table(frame, functions):
    """Return one row per requested statistic, computed column-wise on *frame*.

    Args:
        frame: The DataFrame to summarise.
        functions: Names of DataFrame reduction methods (e.g. "mean",
            "count"). The name "quantile" expands to one row per level in
            ``_QUANTILE_LEVELS``.

    Returns:
        A DataFrame whose first column, "Function", labels each row and whose
        remaining columns are the columns of *frame* (in their original
        order) for which at least one statistic was produced.
    """
    labels = []
    rows = []
    for function in functions:
        if function == "quantile":
            for level in _QUANTILE_LEVELS:
                labels.append(f"Quantile-{level}")
                rows.append(frame.quantile(level, numeric_only=True))
        elif function in _NUMERIC_ONLY_STATS:
            labels.append(function)
            rows.append(getattr(frame, function)(numeric_only=True))
        else:
            # count / nunique / min / max are meaningful for any dtype.
            labels.append(function)
            rows.append(getattr(frame, function)())

    # A list of Series becomes one row per Series; building the table once
    # avoids repeatedly concatenating onto an empty frame.
    table = pd.DataFrame(rows).reset_index(drop=True)
    stat_columns = [
        column
        for column in frame.columns
        if column in table.columns and column != "Function"
    ]
    table["Function"] = labels
    return table[["Function"] + stat_columns]


def descriptive_analysis():
    """Let the user pick a family of statistics and show them for the dataset.

    The analysis family is chosen in the sidebar, the individual functions in
    the main area. Results are rendered as a table with one row per statistic.
    Reads the module-level ``df``; it is not modified.
    """
    groups = {
        "Descriptive Statistics": ["count", "sum", "mean", "median", "min", "max", "std", "var", "quantile"],
        "Aggregation": ["sum", "mean", "median", "std"],
        "Value Counts": ["nunique"],
        "Quantiles and Percentiles": ["quantile"],
        "Miscellaneous Statistics": ["prod", "skew", "kurt"],
    }
    selected_group = st.sidebar.selectbox("Select Analysis Type", list(groups))

    st.write(f"## {selected_group}")

    selected_functions = st.multiselect(
        f"Select functions in {selected_group}", groups[selected_group]
    )
    if not selected_functions:
        st.info("Please select at least one function.")
        return

    results_df = _build_statistics_table(df, selected_functions)

    st.write("### Results:")
    st.dataframe(results_df, hide_index=True)
|
|
|
def data_visualization():
    """Render the chart type chosen in the sidebar for user-selected columns.

    Every chart is drawn on an explicit matplotlib figure that is passed to
    ``st.pyplot`` and closed afterwards, so figures do not accumulate across
    Streamlit reruns. Reads the module-level ``df``; it is not modified.

    NOTE(review): this function relies on ``plt`` (matplotlib.pyplot) and
    ``sns`` (seaborn) being imported at module level; no such import is
    visible in this part of the file -- confirm.
    """
    visuals = [
        "Line Plot", "Bar Chart", "Histogram", "Scatter Plot", "Box Plot",
        "Violin Plot", "Heatmap", "Pair Plot", "Pie Chart",
    ]
    selected_chart = st.sidebar.selectbox("Select Visualization Type", visuals)
    st.subheader(selected_chart)

    all_cols = list(df.columns)
    numeric_df = df.select_dtypes(include="number")
    numeric_cols = list(numeric_df.columns)

    # Every chart except the pie chart plots at least one numeric column.
    if selected_chart != "Pie Chart" and not numeric_cols:
        st.warning("The dataset has no numeric columns to plot.")
        return

    def optional_label(column):
        # Display text for the "no grouping" entry of the grouping selectors.
        return "(none)" if column is None else str(column)

    ax = None
    if selected_chart == "Pair Plot":
        # pairplot creates and owns its own figure.
        figure = sns.pairplot(numeric_df).figure
    else:
        figure, ax = plt.subplots()

    try:
        if selected_chart == "Line Plot":
            x_col = st.selectbox("Select column for x-axis:", all_cols)
            y_col = st.selectbox("Select column for y-axis:", numeric_cols)
            ax.plot(df[x_col], df[y_col])
            ax.set_xlabel(str(x_col))
            ax.set_ylabel(str(y_col))

        elif selected_chart == "Bar Chart":
            cols = st.multiselect("Select columns for bar-chart", numeric_cols)
            if not cols:
                st.info("Please select at least one column.")
                return
            # One bar per selected column, showing that column's mean.
            means = df[cols].mean()
            ax.bar([str(col) for col in cols], means.to_numpy())
            ax.set_ylabel("Mean")

        elif selected_chart == "Histogram":
            col = st.selectbox("Select column for histogram:", numeric_cols)
            ax.hist(df[col].dropna(), bins=10)
            ax.set_xlabel(str(col))

        elif selected_chart == "Scatter Plot":
            x_col = st.selectbox("Select column for x-axis:", numeric_cols)
            y_col = st.selectbox("Select column for y-axis:", numeric_cols)
            ax.scatter(df[x_col], df[y_col])
            ax.set_xlabel(str(x_col))
            ax.set_ylabel(str(y_col))

        elif selected_chart in ("Box Plot", "Violin Plot"):
            value_col = st.selectbox("Select value column:", numeric_cols)
            group_col = st.selectbox(
                "Select grouping column:",
                [None] + all_cols,
                format_func=optional_label,
            )
            draw = sns.boxplot if selected_chart == "Box Plot" else sns.violinplot
            draw(data=df, x=group_col, y=value_col, ax=ax)

        elif selected_chart == "Heatmap":
            sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)

        elif selected_chart == "Pie Chart":
            col = st.selectbox("Select column for pie chart:", all_cols)
            counts = df[col].value_counts()
            if counts.empty:
                st.info("The selected column has no values to plot.")
                return
            ax.pie(
                counts.to_numpy(),
                labels=[str(value) for value in counts.index],
                autopct='%1.1f%%',
            )

        st.pyplot(figure)
    finally:
        # Release the figure whether or not it was rendered.
        plt.close(figure)
|
|
|
def _find_collinear_pairs(correlation_matrix, threshold):
    """List each unordered column pair whose |correlation| exceeds *threshold*.

    Only the upper triangle of the matrix is scanned, so a pair is reported
    once and the diagonal (a column against itself) is never reported.
    Perfectly correlated distinct columns are included. Pairs whose
    correlation is NaN never exceed the threshold and are skipped.

    Returns:
        A DataFrame with columns "Column1", "Column2" and "Correlation".
    """
    names = list(correlation_matrix.columns)
    records = [
        (names[i], names[j], correlation_matrix.iat[i, j])
        for i in range(len(names))
        for j in range(i + 1, len(names))
        if abs(correlation_matrix.iat[i, j]) > threshold
    ]
    return pd.DataFrame(records, columns=["Column1", "Column2", "Correlation"])


def collinearity_pairs():
    """Show a correlation heatmap and the column pairs above a chosen threshold.

    Correlations are computed over the numeric columns of the module-level
    ``df`` only; ``df`` is not modified.

    NOTE(review): relies on ``plt`` (matplotlib.pyplot) and ``sns`` (seaborn)
    being imported at module level; no such import is visible in this part of
    the file -- confirm.
    """
    st.write("### Collinearity")
    st.sidebar.markdown("[Collinearity](#collinearity)")

    # Computed once and reused for both the heatmap and the pair search.
    correlation_matrix = df.select_dtypes(include="number").corr()
    if correlation_matrix.shape[0] < 2:
        st.info("At least two numeric columns are required to compute collinearity.")
        return

    st.subheader("Heatmap")
    figure, ax = plt.subplots()
    try:
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
        st.pyplot(figure)
    finally:
        plt.close(figure)

    collinearity_threshold = st.number_input(
        "Enter collinearity threshold from range [0 1]:",
        min_value=0.0,
        max_value=1.0,
    )

    pairs = _find_collinear_pairs(correlation_matrix, collinearity_threshold)

    st.write(f"Number of distinct pairs: {len(pairs)}")
    st.write("Collinearity Pairs")
    st.dataframe(pairs)
|
|
|
def _coerce_fill_value(text):
    """Interpret *text* as an int, then a float, falling back to the raw string."""
    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            continue
    return text


def missing_values():
    """Apply the selected missing-value strategy to the module-level ``df``.

    Each strategy rebinds ``df`` to a new frame instead of mutating it in
    place, so any other name bound to the same object (such as the pristine
    upload) is left untouched.

    Strategies:
        None: leave the data unchanged.
        dropna: drop every row that contains a missing value.
        Value: fill with a user-entered value (numeric when it parses as one);
            nothing happens until a value has been entered.
        mean: fill numeric columns with their column mean.
        Previous Value / Next Value: forward / backward fill.
        interpolate: interpolate numeric columns; other columns are left as-is.
    """
    global df
    st.write("### Missing Values")
    st.sidebar.markdown("[Missing Values](#missing-values)")
    methods = ["None", "dropna", "Value", "mean", "Previous Value", "Next Value", "interpolate"]
    selected_missing = st.selectbox("Select Missing Values handling method", methods)

    if selected_missing == "dropna":
        df = df.dropna()
    elif selected_missing == "Value":
        value = st.text_input("Enter value:")
        # An empty box means the user has not chosen a value yet.
        if value != "":
            df = df.fillna(_coerce_fill_value(value))
    elif selected_missing == "mean":
        # numeric_only avoids a TypeError on text columns.
        df = df.fillna(df.mean(numeric_only=True))
    elif selected_missing == "Previous Value":
        df = df.ffill()
    elif selected_missing == "Next Value":
        df = df.bfill()
    elif selected_missing == "interpolate":
        numeric_cols = df.select_dtypes(include="number").columns
        interpolated = df.copy()
        interpolated[numeric_cols] = interpolated[numeric_cols].interpolate()
        df = interpolated
|
|
|
|
|
def replace_value():
    """Replace every occurrence of one value in the module-level ``df``.

    Both the value to look for and its replacement are typed as text and then
    converted to the data type chosen by the user. Nothing is replaced until a
    value to be changed has been entered, and an input that cannot be
    converted to the chosen type is reported instead of raising.
    """
    global df
    st.write("### Replace Value")
    st.sidebar.markdown("[Replace Value](#replace-value)")
    prev = st.text_input("Enter value to be changed")
    change = st.text_input("Enter new value")
    # A radio keeps its selection across reruns; a button is only True for
    # the single rerun triggered by its click.
    data_type = st.radio("Data Type:", ["Text", "Int", "Float"])

    if prev == "":
        return

    convert = {"Text": str, "Int": int, "Float": float}[data_type]
    try:
        old_value = convert(prev)
        new_value = convert(change)
    except ValueError:
        st.error(f"Both values must be valid {data_type} values.")
        return

    # replace() returns a new frame; with inplace=True it would return None
    # and rebinding df to that result would discard the dataset.
    df = df.replace(old_value, new_value)
|
|
|
|
|
def display_df():
    """Render the current module-level ``df`` as a table."""
    st.dataframe(data=df)
|
|
|
|
|
def reset_df():
    """Offer a button that restores ``df`` to the data as originally uploaded.

    The working frame is rebound to a *copy* of ``intial_df`` so that later
    in-place edits of ``df`` cannot alter the pristine upload and a subsequent
    reset still restores the original data.
    """
    global df
    st.write("### Reset Data Set")
    st.sidebar.markdown("[Reset Data Set](#reset-data-set)")
    if st.button("Reset Data Set"):
        df = intial_df.copy()
        st.write("Data Set reset complete.")
|
|
|
|
|
def main():
    """Lay out the EDA page: reset, column dropping, dataset view, statistics.

    Rebinds the module-level ``df`` when the user chooses columns to drop.
    """
    global df

    sidebar = st.sidebar
    sidebar.title("EDA Stages")
    reset_df()

    # --- Drop columns -----------------------------------------------------
    sidebar.markdown("[Drop columns](#drop-columns)")
    st.write("### Drop columns")
    selected_cols = st.multiselect("Select any columns to be dropped", df.columns)
    if selected_cols:
        df = df.drop(selected_cols, axis=1)
        st.write(f"Columns Dropped:{selected_cols}")

    # --- Dataset ----------------------------------------------------------
    sidebar.markdown("[Dataset](#dataset)")
    st.write("### Dataset")
    if st.button("Show Dataset"):
        display_df()

    descriptive_analysis()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# The data frames are only created once a CSV has been uploaded, so the page
# body is built only in that case.
if uploaded_file is not None:
    main()
|
|
|
|
|
|