Spaces:
Sleeping
Sleeping
Merge pull request #4 from Shrutisd1003/data-transformation
Browse files- app.py +46 -11
- data_analyzer.py +56 -3
- data_transformer.py +32 -0
- data_visualizer.py +33 -8
- requirements.txt +2 -1
- test.py +38 -12
app.py
CHANGED
@@ -5,27 +5,62 @@ from data_filter import DataFilter
|
|
5 |
from data_transformer import DataTransformer
|
6 |
from data_visualizer import DataVisualizer
|
7 |
from data_QA import DataQA
|
|
|
|
|
8 |
|
9 |
def main():
|
|
|
|
|
|
|
|
|
10 |
st.title('Insights 📶')
|
11 |
|
12 |
data_loader = DataLoader()
|
13 |
data = data_loader.load_data()
|
14 |
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
17 |
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
21 |
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
-
data_QA = DataQA(data)
|
29 |
-
data_QA.ask_csv()
|
30 |
if __name__ == "__main__":
|
31 |
main()
|
|
|
5 |
from data_transformer import DataTransformer
|
6 |
from data_visualizer import DataVisualizer
|
7 |
from data_QA import DataQA
|
8 |
+
import os
|
9 |
+
from streamlit_option_menu import option_menu
|
10 |
|
11 |
def main():
    """Run the Insights app: reset data.csv, load data, render the chosen page.

    The page is picked from a sidebar menu; each menu entry maps to one
    branch below. Nothing beyond the title and the loader is rendered while
    data.csv is still empty after loading.
    """
    csv_path = "data.csv"

    # Begin every run from a zero-byte data.csv: delete any existing file,
    # then create an empty one.
    if os.path.exists(csv_path):
        os.remove(csv_path)
    with open(csv_path, 'w'):
        pass

    st.title('Insights 📶')

    data = DataLoader().load_data()

    # NOTE(review): presumably load_data() writes the uploaded content to
    # data.csv, which is why the file size gates the menu -- confirm.
    if os.path.getsize(csv_path) == 0:
        return

    with st.sidebar:
        selected = option_menu(
            menu_title="Main Menu",
            options=["Data Loader", "Exploratory Data Analysis", "Data Cleaning", "Q/A", "Data Party"])

    if selected == "Data Loader":
        # --- DATA LOADER ---
        st.toast("Data Loaded")
        st.write(data.head())

    elif selected == "Exploratory Data Analysis":
        # --- EDA ---
        analyzer = DataAnalyzer(data)
        analyzer.show_eda()
        analyzer.show_null_value_statistics()
        analyzer.show_count_plots()
        analyzer.show_summary_statistics()

        DataVisualizer(data).visualize_data()

    elif selected == "Data Cleaning":
        # --- DATA CLEANING ---
        # Both objects are constructed before any operation runs, in this
        # order, because their constructors write page headers.
        transformer = DataTransformer(data)
        analyzer = DataAnalyzer(data)
        transformer.perform_column_operation()
        analyzer.show_null_value_statistics()
        transformer.remove_null()
        transformer.impute_null()
        transformer.remove_columns()
        # Row filtering through DataFilter is currently disabled.

    elif selected == "Q/A":
        # --- QUESTION AND ANSWER ---
        DataQA(data).ask_csv()

    elif selected == "Data Party":
        # --- DATA PARTY ---
        st.write("To be continued... :)")


if __name__ == "__main__":
    main()
|
data_analyzer.py
CHANGED
@@ -1,13 +1,66 @@
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
2 |
|
3 |
class DataAnalyzer:
|
4 |
def __init__(self, data):
|
5 |
self.data = data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
def show_summary_statistics(self):
|
8 |
if st.button('Show Summary Statistics'):
|
9 |
st.write(self.data.describe())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import seaborn as sns
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
|
6 |
class DataAnalyzer:
    """Render exploratory-data-analysis views of a DataFrame with Streamlit."""

    def __init__(self, data):
        """Keep a reference to ``data`` and write the section header.

        Note that constructing the object already renders the
        "Exploratory Data Analysis" header on the page.
        """
        self.data = data
        st.header("Exploratory Data Analysis")

    def show_eda(self):
        """Show row/column counts and a table of column names grouped by dtype."""
        st.write("Number of rows:", self.data.shape[0])
        st.write("Number of columns:", self.data.shape[1])

        columns_by_dtype = {}
        for column_name, dtype in self.data.dtypes.items():
            # str() keeps the join below working for non-string column labels
            # (e.g. integer labels).
            columns_by_dtype.setdefault(str(dtype), []).append(str(column_name))

        df = pd.DataFrame(
            [[dtype, ', '.join(columns)] for dtype, columns in columns_by_dtype.items()],
            columns=["Data Type", "Column Names"],
        )
        st.subheader("Columns by Data Type")
        st.dataframe(df, hide_index=True, use_container_width=True)

    def show_summary_statistics(self):
        """Show ``describe()`` output once the user presses the button."""
        if st.button('Show Summary Statistics'):
            st.write(self.data.describe())
            # describe(include=object) raises ValueError when the frame has
            # no object columns, so only request it when some exist.
            if not self.data.select_dtypes(include=object).columns.empty:
                st.write(self.data.describe(include=object))

    def _null_value_table(self):
        """Build the per-column null counts/percentages plus a 'Total' row."""
        null_counts = self.data.isnull().sum()
        total_null = null_counts.sum()
        total_rows = self.data.shape[0]
        total_cells = total_rows * self.data.shape[1]
        null_percentages = (null_counts / total_rows) * 100

        null_stats_df = pd.DataFrame({
            'Column Name': null_counts.index,
            'Null Values': null_counts.values,
            'Percentage Null': null_percentages.values,
        })
        # An empty frame has no cells; report 0% instead of dividing by zero.
        total_percentage = (total_null / total_cells) * 100 if total_cells else 0.0
        null_stats_df.loc[len(null_stats_df)] = ['Total', total_null, total_percentage]
        return null_stats_df

    def show_null_value_statistics(self):
        """Show a table of null counts and percentages for every column."""
        st.subheader("Null Value Statistics")
        st.dataframe(self._null_value_table(), hide_index=True, use_container_width=True)

    def show_count_plots(self):
        """Plot every column: a count plot for <= 12 unique values, else a histogram."""
        st.subheader("Count Plots")
        sns.set(style="whitegrid")

        for column_name in self.data.columns:
            fig, ax = plt.subplots(figsize=(10, 6))
            if self.data[column_name].nunique() <= 12:
                sns.countplot(data=self.data, x=column_name, ax=ax)
                ax.set_title(f'Count Plot of {column_name}')
            else:
                sns.histplot(data=self.data, x=column_name, bins=20, ax=ax)
                ax.set_title(f'Histogram of {column_name}')
                ax.set_xlabel(column_name)
            st.pyplot(fig)
            # pyplot keeps every figure registered until it is closed; close it
            # so figures do not pile up across columns and script reruns.
            plt.close(fig)
|
data_transformer.py
CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
|
|
4 |
class DataTransformer:
|
5 |
def __init__(self, data):
|
6 |
self.data = data
|
|
|
7 |
|
8 |
def perform_column_operation(self):
|
9 |
column_operation = st.sidebar.text_input('Column operation (e.g., age * 2)')
|
@@ -13,8 +14,39 @@ class DataTransformer:
|
|
13 |
st.write(self.data)
|
14 |
return self.data
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
#transformed data is not retained
|
17 |
#null values handling
|
18 |
#2 options - to remove or to impute that is the question
|
|
|
|
|
|
|
19 |
#give option to analyse the transformed dataset or save it.
|
20 |
|
|
|
4 |
class DataTransformer:
|
5 |
def __init__(self, data):
|
6 |
self.data = data
|
7 |
+
st.header("Data Cleaning")
|
8 |
|
9 |
def perform_column_operation(self):
|
10 |
column_operation = st.sidebar.text_input('Column operation (e.g., age * 2)')
|
|
|
14 |
st.write(self.data)
|
15 |
return self.data
|
16 |
|
17 |
+
def remove_null(self):
    """Drop rows that have nulls in the columns the user selects.

    The column picker is rendered before (not inside) the button branch:
    ``st.button`` is True only for the single rerun caused by the click, so
    a widget created inside that branch never holds a selection when the
    action runs.

    Returns:
        ``self.data``, modified in place when the button was pressed with at
        least one column selected.
    """
    col = st.multiselect('Choose columns to remove nulls', self.data.columns)
    if st.button('Remove Null'):
        if col:
            self.data.dropna(subset=col, inplace=True)
            st.toast("Null values removed")
        else:
            st.warning('Select at least one column to remove nulls from.')
    return self.data
|
23 |
+
|
24 |
+
def impute_null(self):
    """Fill nulls in user-selected numeric columns with mean, mode or 0.

    The pickers are rendered before (not inside) the button branch:
    ``st.button`` is True only for the single rerun caused by the click, so
    widgets created inside that branch never hold a selection when the
    action runs.

    Returns:
        ``self.data``, with the selected columns filled when the button was
        pressed with at least one column selected.
    """
    # "number" selects the same columns as np.number without needing numpy
    # to be imported in this module.
    numeric_columns = self.data.select_dtypes(include="number").columns
    col = st.multiselect('Choose columns to impute nulls', numeric_columns)
    option = st.selectbox('Impute nulls with', ('mean', 'mode', '0'))

    if st.button('Impute Null'):
        if not col:
            st.warning('Select at least one column to impute nulls in.')
            return self.data

        selected = self.data[col]
        if option == "mean":
            fill_values = selected.mean()
        elif option == "mode":
            modes = selected.mode()
            # mode() returns one row per tied mode; use the first row. It has
            # no rows at all when every selected column is entirely null.
            fill_values = modes.iloc[0] if len(modes) else None
        else:
            # Numeric zero, so the columns stay numeric.
            fill_values = 0

        if fill_values is not None:
            # fillna returns a new frame; assign it back so the change sticks.
            self.data[col] = selected.fillna(fill_values)
        st.toast("Null values filled")
    return self.data
|
36 |
+
|
37 |
+
def remove_columns(self):
    """Drop the columns the user selects.

    The column picker is rendered before (not inside) the button branch:
    ``st.button`` is True only for the single rerun caused by the click, so
    a widget created inside that branch never holds a selection when the
    action runs.

    Returns:
        ``self.data``, modified in place when the button was pressed with at
        least one column selected.
    """
    col = st.multiselect('Choose columns to remove', self.data.columns)
    if st.button('Remove Columns'):
        if col:
            self.data.drop(columns=col, inplace=True)
            st.toast("Columns removed")
        else:
            st.warning('Select at least one column to remove.')
    return self.data
|
43 |
+
|
44 |
+
# PROBLEMS RESOLVED
|
45 |
#transformed data is not retained
|
46 |
#null values handling
|
47 |
# two options for null handling: remove the rows or impute the values
|
48 |
+
|
49 |
+
# PROBLEMS TO BE ADDRESSED
|
50 |
+
# convert categorical columns to numerical
|
51 |
#give option to analyse the transformed dataset or save it.
|
52 |
|
data_visualizer.py
CHANGED
@@ -7,9 +7,11 @@ import seaborn as sns
|
|
7 |
class DataVisualizer:
|
8 |
def __init__(self, data):
|
9 |
self.data = data
|
|
|
10 |
|
11 |
def visualize_data(self):
|
12 |
plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
|
|
|
13 |
if plot_type == 'Histogram':
|
14 |
numeric_columns = self.data.select_dtypes(include=[np.number]).columns
|
15 |
if numeric_columns.empty:
|
@@ -18,7 +20,11 @@ class DataVisualizer:
|
|
18 |
column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
|
19 |
fig, ax = plt.subplots()
|
20 |
ax.hist(self.data[column_to_visualize])
|
|
|
|
|
|
|
21 |
st.pyplot(fig)
|
|
|
22 |
elif plot_type == 'Box Plot':
|
23 |
numeric_columns = self.data.select_dtypes(include=[np.number]).columns
|
24 |
if numeric_columns.empty:
|
@@ -27,23 +33,42 @@ class DataVisualizer:
|
|
27 |
column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
|
28 |
fig, ax = plt.subplots()
|
29 |
ax.boxplot(self.data[column_to_visualize].dropna())
|
|
|
|
|
30 |
st.pyplot(fig)
|
|
|
31 |
elif plot_type == 'Pie Chart':
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
elif plot_type == 'Scatter Plot':
|
37 |
-
|
38 |
-
|
39 |
-
st.
|
|
|
|
|
|
|
|
|
40 |
else:
|
41 |
fig, ax = plt.subplots()
|
42 |
-
ax.scatter(self.data[
|
|
|
|
|
|
|
43 |
st.pyplot(fig)
|
|
|
44 |
elif plot_type == 'Heatmap':
|
45 |
numeric_data = self.data.select_dtypes(include=[np.number])
|
46 |
corr = numeric_data.corr()
|
47 |
fig, ax = plt.subplots()
|
48 |
sns.heatmap(corr, annot=True, ax=ax)
|
|
|
49 |
st.pyplot(fig)
|
|
|
7 |
class DataVisualizer:
|
8 |
def __init__(self, data):
|
9 |
self.data = data
|
10 |
+
st.subheader("Data Visualizer")
|
11 |
|
12 |
def visualize_data(self):
|
13 |
plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
|
14 |
+
|
15 |
if plot_type == 'Histogram':
|
16 |
numeric_columns = self.data.select_dtypes(include=[np.number]).columns
|
17 |
if numeric_columns.empty:
|
|
|
20 |
column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
|
21 |
fig, ax = plt.subplots()
|
22 |
ax.hist(self.data[column_to_visualize])
|
23 |
+
ax.set_title(f'Histogram of {column_to_visualize}')
|
24 |
+
ax.set_xlabel(column_to_visualize)
|
25 |
+
ax.set_ylabel('Frequency')
|
26 |
st.pyplot(fig)
|
27 |
+
|
28 |
elif plot_type == 'Box Plot':
|
29 |
numeric_columns = self.data.select_dtypes(include=[np.number]).columns
|
30 |
if numeric_columns.empty:
|
|
|
33 |
column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
|
34 |
fig, ax = plt.subplots()
|
35 |
ax.boxplot(self.data[column_to_visualize].dropna())
|
36 |
+
ax.set_title(f'Box Plot of {column_to_visualize}')
|
37 |
+
ax.set_ylabel(column_to_visualize)
|
38 |
st.pyplot(fig)
|
39 |
+
|
40 |
elif plot_type == 'Pie Chart':
|
41 |
+
nonnumeric_columns = self.data.select_dtypes(include=['object']).columns
|
42 |
+
if nonnumeric_columns.empty:
|
43 |
+
st.warning('No non numeric columns in the data to visualize.')
|
44 |
+
else:
|
45 |
+
column_to_visualize = st.selectbox('Choose a column to visualize', nonnumeric_columns)
|
46 |
+
fig, ax = plt.subplots()
|
47 |
+
self.data[column_to_visualize].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%', textprops={'fontsize': 'small'})
|
48 |
+
ax.set_title(f'Pie Chart of {column_to_visualize}')
|
49 |
+
ax.set_ylabel('')
|
50 |
+
st.pyplot(fig)
|
51 |
+
|
52 |
elif plot_type == 'Scatter Plot':
|
53 |
+
left, right = st.columns(2)
|
54 |
+
with left:
|
55 |
+
x_col = st.selectbox('Choose values on X axis', self.data.select_dtypes(include=[np.number]).columns)
|
56 |
+
with right:
|
57 |
+
y_col = st.selectbox('Choose values on Y axis', self.data.select_dtypes(include=[np.number]).columns)
|
58 |
+
if x_col == y_col:
|
59 |
+
st.warning('Please select two different columns for scatter plot.')
|
60 |
else:
|
61 |
fig, ax = plt.subplots()
|
62 |
+
ax.scatter(self.data[x_col], self.data[y_col])
|
63 |
+
ax.set_title(f'Scatter Plot of {x_col} vs {y_col}')
|
64 |
+
ax.set_xlabel(x_col)
|
65 |
+
ax.set_ylabel(y_col)
|
66 |
st.pyplot(fig)
|
67 |
+
|
68 |
elif plot_type == 'Heatmap':
|
69 |
numeric_data = self.data.select_dtypes(include=[np.number])
|
70 |
corr = numeric_data.corr()
|
71 |
fig, ax = plt.subplots()
|
72 |
sns.heatmap(corr, annot=True, ax=ax)
|
73 |
+
ax.set_title('Correlation Heatmap')
|
74 |
st.pyplot(fig)
|
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ seaborn
|
|
6 |
langchain-google-genai
|
7 |
langchain-experimental
|
8 |
python-dotenv
|
9 |
-
tabulate
|
|
|
|
6 |
langchain-google-genai
|
7 |
langchain-experimental
|
8 |
python-dotenv
|
9 |
+
tabulate
|
10 |
+
streamlit_option_menu
|
test.py
CHANGED
@@ -1,16 +1,42 @@
|
|
1 |
-
|
2 |
-
from
|
3 |
-
|
4 |
-
from
|
|
|
|
|
|
|
5 |
import os
|
6 |
-
load_dotenv() # take environment variables from .env.
|
7 |
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
csv_agent = create_csv_agent(llm,"data.csv", verbose=True)
|
13 |
-
question = "describe the dataset"
|
14 |
-
response = csv_agent.run(question)
|
15 |
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from data_loader import DataLoader
|
3 |
+
from data_analyzer import DataAnalyzer
|
4 |
+
from data_filter import DataFilter
|
5 |
+
from data_transformer import DataTransformer
|
6 |
+
from data_visualizer import DataVisualizer
|
7 |
+
from data_QA import DataQA
|
8 |
import os
|
|
|
9 |
|
10 |
+
def main():
    """Run every stage of the app on one page: analyze, filter, transform, plot, Q/A.

    data.csv is reset to an empty file at the start of each run; the stages
    are rendered only if the file is non-empty after loading.
    """
    # Begin every run from a zero-byte data.csv.
    if os.path.exists("data.csv"):
        os.remove("data.csv")
    with open("data.csv", 'w'):
        pass
    st.title('Insights 📶')

    data_loader = DataLoader()
    data = data_loader.load_data()

    if os.path.getsize("data.csv") != 0:
        data_analyzer = DataAnalyzer(data)
        data_analyzer.show_summary_statistics()
        # DataAnalyzer has no show_data_types(); the columns-by-dtype table
        # is rendered by show_eda().
        data_analyzer.show_eda()
        data_analyzer.show_null_value_statistics()

        data_filter = DataFilter(data)
        data = data_filter.filter_rows()

        # Each transformer step returns the frame it worked on; keep the latest.
        data_transformer = DataTransformer(data)
        data = data_transformer.perform_column_operation()
        data = data_transformer.remove_null()
        data = data_transformer.impute_null()
        data = data_transformer.remove_columns()

        data_visualizer = DataVisualizer(data)
        data_visualizer.visualize_data()

        data_QA = DataQA(data)
        data_QA.ask_csv()


if __name__ == "__main__":
    main()
|