Insights / data_analyzer.py
shrutisd1003's picture
modified app and analyzer
7c20203
raw
history blame
2.68 kB
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
class DataAnalyzer:
    """Render exploratory-data-analysis views of a DataFrame in Streamlit.

    Each public ``show_*`` method writes one section (headers, tables or
    plots) to the current Streamlit page. The table-building logic lives in
    private helpers that return plain DataFrames, so it can be exercised
    without a running Streamlit session.
    """

    def __init__(self, data: pd.DataFrame) -> None:
        """Store the DataFrame and emit the page header.

        Args:
            data: The DataFrame to analyse. It is stored by reference and is
                not modified by any method of this class.
        """
        self.data = data
        st.header("Exploratory Data Analysis")

    def show_eda(self) -> None:
        """Show the row/column counts and the columns grouped by dtype."""
        row_count, column_count = self.data.shape
        st.write("Number of rows:", row_count)
        st.write("Number of columns:", column_count)

        st.subheader("Columns by Data Type")
        st.dataframe(
            self._columns_by_dtype(),
            hide_index=True,
            use_container_width=True,
        )

    def _columns_by_dtype(self) -> pd.DataFrame:
        """Return one row per dtype listing the columns that have it.

        Returns:
            A DataFrame with columns ``"Data Type"`` and ``"Column Names"``;
            the names are joined with ``", "`` in their original order.
        """
        grouped = {}
        for column_name, dtype in self.data.dtypes.items():
            # Column labels are not always strings (e.g. integer labels from
            # a header-less CSV); str() keeps the join below from raising.
            grouped.setdefault(str(dtype), []).append(str(column_name))

        rows = [[dtype, ", ".join(names)] for dtype, names in grouped.items()]
        return pd.DataFrame(rows, columns=["Data Type", "Column Names"])

    def show_summary_statistics(self) -> None:
        """Show ``describe()`` tables once the user clicks the button."""
        if st.button('Show Summary Statistics'):
            for table in self._summary_tables():
                st.write(table)

    def _summary_tables(self) -> list:
        """Build the summary tables that can be computed for the data.

        Returns:
            A list of DataFrames: the default ``describe()`` output followed,
            when object-dtype columns exist, by ``describe(include=object)``.
            The list is empty when the data has no columns.
        """
        # describe() raises ValueError on a frame without columns.
        if self.data.shape[1] == 0:
            return []

        tables = [self.data.describe()]
        # describe(include=object) raises ValueError when nothing matches,
        # so only request it when there is at least one object column.
        if not self.data.select_dtypes(include=object).columns.empty:
            tables.append(self.data.describe(include=object))
        return tables

    def show_null_value_statistics(self) -> None:
        """Show per-column and overall null counts and percentages."""
        st.subheader("Null Value Statistics")
        st.dataframe(
            self._null_statistics(),
            hide_index=True,
            use_container_width=True,
        )

    def _null_statistics(self) -> pd.DataFrame:
        """Return null counts and percentages per column plus a total row.

        Returns:
            A DataFrame with columns ``"Column Name"``, ``"Null Values"`` and
            ``"Percentage Null"``. The last row is labelled ``"Total"`` and
            its percentage is relative to the number of cells. When the data
            has no rows (or no cells) the percentages are reported as 0.0
            instead of dividing by zero.
        """
        null_counts = self.data.isnull().sum()
        total_rows, total_columns = self.data.shape
        total_cells = total_rows * total_columns
        total_null = int(null_counts.sum())

        if total_rows:
            percentages = (null_counts / total_rows * 100).tolist()
        else:
            percentages = [0.0] * total_columns
        overall_percentage = total_null / total_cells * 100 if total_cells else 0.0

        return pd.DataFrame({
            'Column Name': [*null_counts.index, 'Total'],
            'Null Values': [*null_counts.tolist(), total_null],
            'Percentage Null': [*percentages, overall_percentage],
        })

    def show_count_plots(self) -> None:
        """Plot every column: a count plot for columns with at most 12
        distinct values, a 20-bin histogram otherwise."""
        st.subheader("Count Plots")
        sns.set(style="whitegrid")

        for column_name in self.data.columns:
            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                if self.data[column_name].nunique() <= 12:
                    sns.countplot(data=self.data, x=column_name, ax=ax)
                    ax.set_title(f'Count Plot of {column_name}')
                else:
                    sns.histplot(data=self.data, x=column_name, bins=20, ax=ax)
                    ax.set_title(f'Histogram of {column_name}')
                    ax.set_xlabel(column_name)
                st.pyplot(fig)
            finally:
                # pyplot keeps a reference to every open figure; close it so
                # figures do not accumulate across columns and reruns.
                plt.close(fig)