Atharva Thakur committed on
Commit
91d7466
•
2 Parent(s): 724bf34 7c20203

Merge pull request #4 from Shrutisd1003/data-transformation

Browse files
Files changed (6) hide show
  1. app.py +46 -11
  2. data_analyzer.py +56 -3
  3. data_transformer.py +32 -0
  4. data_visualizer.py +33 -8
  5. requirements.txt +2 -1
  6. test.py +38 -12
app.py CHANGED
@@ -5,27 +5,62 @@ from data_filter import DataFilter
5
  from data_transformer import DataTransformer
6
  from data_visualizer import DataVisualizer
7
  from data_QA import DataQA
 
 
8
 
9
  def main():
 
 
 
 
10
  st.title('Insights 📶')
11
 
12
  data_loader = DataLoader()
13
  data = data_loader.load_data()
14
 
15
- data_transformer = DataTransformer(data)
16
- modified_data = data_transformer.perform_column_operation()
 
 
 
17
 
18
- data_analyzer = DataAnalyzer(modified_data)
19
- data_analyzer.show_summary_statistics()
20
- data_analyzer.show_data_types()
 
21
 
22
- data_filter = DataFilter(modified_data)
23
- data = data_filter.filter_rows()
 
 
 
 
 
24
 
25
- data_visualizer = DataVisualizer(modified_data)
26
- data_visualizer.visualize_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- data_QA = DataQA(data)
29
- data_QA.ask_csv()
30
  if __name__ == "__main__":
31
  main()
 
5
  from data_transformer import DataTransformer
6
  from data_visualizer import DataVisualizer
7
  from data_QA import DataQA
8
+ import os
9
+ from streamlit_option_menu import option_menu
10
 
11
def main():
    """Run the Streamlit app: reset the working CSV, load data, show a page.

    ``data.csv`` is recreated empty on every run before the loader is called.
    Nothing beyond the title and the loader is rendered while that file is
    still empty afterwards; otherwise the sidebar menu picks which section
    is drawn.
    """
    # Start each run from an empty data.csv.
    if os.path.exists("data.csv"):
        os.remove("data.csv")
    with open("data.csv", 'w'):
        pass

    st.title('Insights 📶')

    loader = DataLoader()
    data = loader.load_data()

    # Guard clause: no page is shown until data.csv has content.
    if os.path.getsize("data.csv") == 0:
        return

    with st.sidebar:
        selected = option_menu(
            menu_title="Main Menu",
            options=["Data Loader", "Exploratory Data Analysis", "Data Cleaning", "Q/A", "Data Party"])

    if selected == "Data Loader":
        # --- DATA LOADER ---
        st.toast("Data Loaded")
        st.write(data.head())

    elif selected == "Exploratory Data Analysis":
        # --- EDA ---
        analyzer = DataAnalyzer(data)
        analyzer.show_eda()
        analyzer.show_null_value_statistics()
        analyzer.show_count_plots()
        analyzer.show_summary_statistics()

        visualizer = DataVisualizer(data)
        visualizer.visualize_data()

    elif selected == "Data Cleaning":
        # --- DATA CLEANING ---
        # Construction order matters: each constructor writes its own header.
        transformer = DataTransformer(data)
        analyzer = DataAnalyzer(data)
        transformer.perform_column_operation()
        analyzer.show_null_value_statistics()
        transformer.remove_null()
        transformer.impute_null()
        transformer.remove_columns()
        # Row filtering through DataFilter is currently disabled.

    elif selected == "Q/A":
        # --- QUESTION AND ANSWER ---
        qa = DataQA(data)
        qa.ask_csv()

    elif selected == "Data Party":
        # --- DATA PARTY ---
        st.write("To be continued... :)")
64
 
 
 
65
  if __name__ == "__main__":
66
  main()
data_analyzer.py CHANGED
@@ -1,13 +1,66 @@
1
  import streamlit as st
 
 
 
2
 
3
  class DataAnalyzer:
4
  def __init__(self, data):
5
  self.data = data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def show_summary_statistics(self):
8
  if st.button('Show Summary Statistics'):
9
  st.write(self.data.describe())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- def show_data_types(self):
12
- if st.button('Show Data Types'):
13
- st.write(self.data.dtypes)
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
 
6
class DataAnalyzer:
    """Render exploratory-data-analysis output for a DataFrame in Streamlit.

    Uses the module-level imports ``st`` (streamlit), ``pd`` (pandas),
    ``sns`` (seaborn) and ``plt`` (matplotlib.pyplot).
    """

    # Columns with at most this many distinct values get a count plot;
    # columns with more get a histogram.
    MAX_COUNT_PLOT_CATEGORIES = 12

    def __init__(self, data):
        """Store *data* and write the section header.

        NOTE(review): the header is written on every construction, so any
        page that builds a DataAnalyzer shows it -- confirm this is intended.
        """
        self.data = data
        st.header("Exploratory Data Analysis")

    @staticmethod
    def _columns_by_dtype(data):
        """Return a table with one row per dtype listing its column names."""
        grouped = {}
        for column_name, dtype in data.dtypes.items():
            # str() on the label: join() would raise TypeError for
            # non-string column labels such as integers.
            grouped.setdefault(str(dtype), []).append(str(column_name))
        rows = [[dtype, ', '.join(names)] for dtype, names in grouped.items()]
        return pd.DataFrame(rows, columns=["Data Type", "Column Names"])

    @staticmethod
    def _null_value_stats(data):
        """Return per-column null counts/percentages plus a 'Total' row."""
        null_counts = data.isnull().sum()
        total_rows, total_columns = data.shape
        total_null = int(null_counts.sum())

        # Guard both divisions so empty data reports 0% instead of NaN/inf.
        if total_rows:
            percentages = [float(count) / total_rows * 100 for count in null_counts]
        else:
            percentages = [0.0] * total_columns
        total_cells = total_rows * total_columns
        total_percentage = total_null / total_cells * 100 if total_cells else 0.0

        return pd.DataFrame({
            'Column Name': [*null_counts.index, 'Total'],
            'Null Values': [*(int(count) for count in null_counts), total_null],
            'Percentage Null': [*percentages, total_percentage],
        })

    def show_eda(self):
        """Write the row/column counts and the columns grouped by dtype."""
        st.write("Number of rows:", self.data.shape[0])
        st.write("Number of columns:", self.data.shape[1])
        st.subheader("Columns by Data Type")
        st.dataframe(self._columns_by_dtype(self.data), hide_index=True, use_container_width=True)

    def show_summary_statistics(self):
        """Show ``describe()`` output when the button is pressed."""
        if not st.button('Show Summary Statistics'):
            return
        if self.data.shape[1] == 0:
            # describe() raises ValueError on a frame without columns.
            st.warning('No columns in the data to summarize.')
            return
        st.write(self.data.describe())
        # describe(include=object) raises ValueError when no column matches,
        # so only request it when object columns exist.
        if not self.data.select_dtypes(include=object).columns.empty:
            st.write(self.data.describe(include=object))

    def show_null_value_statistics(self):
        """Show the null-value table for the stored data."""
        st.subheader("Null Value Statistics")
        st.dataframe(self._null_value_stats(self.data), hide_index=True, use_container_width=True)

    def show_count_plots(self):
        """Draw one plot per column: count plot or 20-bin histogram."""
        st.subheader("Count Plots")
        sns.set(style="whitegrid")

        for column_name in self.data.columns:
            distinct_values = self.data[column_name].nunique()
            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                if distinct_values <= self.MAX_COUNT_PLOT_CATEGORIES:
                    sns.countplot(data=self.data, x=column_name, ax=ax)
                    ax.set_title(f'Count Plot of {column_name}')
                else:
                    sns.histplot(data=self.data, x=column_name, bins=20, ax=ax)
                    ax.set_title(f'Histogram of {column_name}')
                    ax.set_xlabel(column_name)
                st.pyplot(fig)
            finally:
                # Release the figure; otherwise pyplot keeps every figure
                # alive and they pile up across columns and script reruns.
                plt.close(fig)
data_transformer.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
4
  class DataTransformer:
5
  def __init__(self, data):
6
  self.data = data
 
7
 
8
  def perform_column_operation(self):
9
  column_operation = st.sidebar.text_input('Column operation (e.g., age * 2)')
@@ -13,8 +14,39 @@ class DataTransformer:
13
  st.write(self.data)
14
  return self.data
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  #transformed data is not retained
17
  #null values handling
18
  #2 options - to remove or to impute that is the question
 
 
 
19
  #give option to analyse the transformed dataset or save it.
20
 
 
4
  class DataTransformer:
5
  def __init__(self, data):
6
  self.data = data
7
+ st.header("Data Cleaning")
8
 
9
  def perform_column_operation(self):
10
  column_operation = st.sidebar.text_input('Column operation (e.g., age * 2)')
 
14
  st.write(self.data)
15
  return self.data
16
 
17
def remove_null(self):
    """Drop rows that have nulls in the user-selected columns.

    Returns:
        ``self.data``; modified in place when the button is pressed with at
        least one column selected, otherwise unchanged.
    """
    # The selector is rendered on every run. A widget created inside an
    # `if st.button(...)` branch only exists on the run right after the
    # click, so its selection could never reach dropna().
    columns = st.multiselect('Choose columns to remove nulls', self.data.columns)
    if st.button('Remove Null'):
        if columns:
            self.data.dropna(subset=columns, inplace=True)
            st.toast("Null values removed")
        else:
            st.warning('Select at least one column to remove nulls from.')
    return self.data
23
+
24
def impute_null(self):
    """Fill nulls in the user-selected numeric columns.

    The user picks the columns and a strategy: the column mean, the column
    mode (the first one when several tie), or the number 0.

    Returns:
        ``self.data``; the selected columns are replaced with their filled
        versions when the button is pressed, otherwise it is unchanged.
    """
    # "number" selects every numeric dtype without needing numpy in scope.
    numeric_columns = self.data.select_dtypes(include="number").columns

    # Inputs are rendered on every run; widgets nested inside an
    # `if st.button(...)` branch vanish on the rerun their use triggers.
    columns = st.multiselect('Choose columns to impute nulls', numeric_columns)
    option = st.selectbox('Impute nulls with', ('mean', 'mode', '0'))

    if st.button('Impute Null'):
        if not columns:
            st.warning('Select at least one column to impute.')
            return self.data

        for column in columns:
            series = self.data[column]
            if option == "mean":
                fill_value = series.mean()
            elif option == "mode":
                modes = series.mode()
                if modes.empty:
                    # Column holds only nulls: there is no mode to fill with.
                    continue
                fill_value = modes.iloc[0]
            else:
                # Numeric zero, not the string "0", so the dtype stays numeric.
                fill_value = 0
            # fillna returns a new Series; assign it back so the fill sticks.
            self.data[column] = series.fillna(fill_value)
        st.toast("Null values filled")
    return self.data
36
+
37
def remove_columns(self):
    """Drop the user-selected columns from the data.

    Returns:
        ``self.data``; modified in place when the button is pressed with at
        least one column selected, otherwise unchanged.
    """
    # Rendered outside the button branch so the selection survives the
    # rerun that the button click triggers.
    columns = st.multiselect('Choose columns to remove', self.data.columns)
    if st.button('Remove Columns'):
        if columns:
            self.data.drop(columns=columns, inplace=True)
            st.toast("Columns removed")
        else:
            st.warning('Select at least one column to remove.')
    return self.data
43
+
44
+ # PROBLEMS RESOLVED
45
  #transformed data is not retained
46
  #null values handling
47
  #2 options - to remove or to impute that is the question
48
+
49
+ # PROBLEMS TO BE ADDRESSED
50
+ #categorical to numerical
51
  #give option to analyse the transformed dataset or save it.
52
 
data_visualizer.py CHANGED
@@ -7,9 +7,11 @@ import seaborn as sns
7
  class DataVisualizer:
8
  def __init__(self, data):
9
  self.data = data
 
10
 
11
  def visualize_data(self):
12
  plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
 
13
  if plot_type == 'Histogram':
14
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
15
  if numeric_columns.empty:
@@ -18,7 +20,11 @@ class DataVisualizer:
18
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
19
  fig, ax = plt.subplots()
20
  ax.hist(self.data[column_to_visualize])
 
 
 
21
  st.pyplot(fig)
 
22
  elif plot_type == 'Box Plot':
23
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
24
  if numeric_columns.empty:
@@ -27,23 +33,42 @@ class DataVisualizer:
27
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
28
  fig, ax = plt.subplots()
29
  ax.boxplot(self.data[column_to_visualize].dropna())
 
 
30
  st.pyplot(fig)
 
31
  elif plot_type == 'Pie Chart':
32
- column_to_visualize = st.selectbox('Choose a column to visualize', self.data.select_dtypes(include=['object']).columns)
33
- fig, ax = plt.subplots()
34
- self.data[column_to_visualize].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%', textprops={'fontsize': 'small'})
35
- st.pyplot(fig)
 
 
 
 
 
 
 
36
  elif plot_type == 'Scatter Plot':
37
- columns_to_visualize = st.multiselect('Choose two columns to visualize', self.data.select_dtypes(include=[np.number]).columns)
38
- if len(columns_to_visualize) != 2:
39
- st.warning('Please select exactly two columns for scatter plot.')
 
 
 
 
40
  else:
41
  fig, ax = plt.subplots()
42
- ax.scatter(self.data[columns_to_visualize[0]], self.data[columns_to_visualize[1]])
 
 
 
43
  st.pyplot(fig)
 
44
  elif plot_type == 'Heatmap':
45
  numeric_data = self.data.select_dtypes(include=[np.number])
46
  corr = numeric_data.corr()
47
  fig, ax = plt.subplots()
48
  sns.heatmap(corr, annot=True, ax=ax)
 
49
  st.pyplot(fig)
 
7
  class DataVisualizer:
8
  def __init__(self, data):
9
  self.data = data
10
+ st.subheader("Data Visualizer")
11
 
12
  def visualize_data(self):
13
  plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
14
+
15
  if plot_type == 'Histogram':
16
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
17
  if numeric_columns.empty:
 
20
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
21
  fig, ax = plt.subplots()
22
  ax.hist(self.data[column_to_visualize])
23
+ ax.set_title(f'Histogram of {column_to_visualize}')
24
+ ax.set_xlabel(column_to_visualize)
25
+ ax.set_ylabel('Frequency')
26
  st.pyplot(fig)
27
+
28
  elif plot_type == 'Box Plot':
29
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
30
  if numeric_columns.empty:
 
33
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
34
  fig, ax = plt.subplots()
35
  ax.boxplot(self.data[column_to_visualize].dropna())
36
+ ax.set_title(f'Box Plot of {column_to_visualize}')
37
+ ax.set_ylabel(column_to_visualize)
38
  st.pyplot(fig)
39
+
40
  elif plot_type == 'Pie Chart':
41
+ nonnumeric_columns = self.data.select_dtypes(include=['object']).columns
42
+ if nonnumeric_columns.empty:
43
+ st.warning('No non numeric columns in the data to visualize.')
44
+ else:
45
+ column_to_visualize = st.selectbox('Choose a column to visualize', nonnumeric_columns)
46
+ fig, ax = plt.subplots()
47
+ self.data[column_to_visualize].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%', textprops={'fontsize': 'small'})
48
+ ax.set_title(f'Pie Chart of {column_to_visualize}')
49
+ ax.set_ylabel('')
50
+ st.pyplot(fig)
51
+
52
  elif plot_type == 'Scatter Plot':
53
+ left, right = st.columns(2)
54
+ with left:
55
+ x_col = st.selectbox('Choose values on X axis', self.data.select_dtypes(include=[np.number]).columns)
56
+ with right:
57
+ y_col = st.selectbox('Choose values on Y axis', self.data.select_dtypes(include=[np.number]).columns)
58
+ if x_col == y_col:
59
+ st.warning('Please select two different columns for scatter plot.')
60
  else:
61
  fig, ax = plt.subplots()
62
+ ax.scatter(self.data[x_col], self.data[y_col])
63
+ ax.set_title(f'Scatter Plot of {x_col} vs {y_col}')
64
+ ax.set_xlabel(x_col)
65
+ ax.set_ylabel(y_col)
66
  st.pyplot(fig)
67
+
68
  elif plot_type == 'Heatmap':
69
  numeric_data = self.data.select_dtypes(include=[np.number])
70
  corr = numeric_data.corr()
71
  fig, ax = plt.subplots()
72
  sns.heatmap(corr, annot=True, ax=ax)
73
+ ax.set_title('Correlation Heatmap')
74
  st.pyplot(fig)
requirements.txt CHANGED
@@ -6,4 +6,5 @@ seaborn
6
  langchain-google-genai
7
  langchain-experimental
8
  python-dotenv
9
- tabulate
 
 
6
  langchain-google-genai
7
  langchain-experimental
8
  python-dotenv
9
+ tabulate
10
+ streamlit_option_menu
test.py CHANGED
@@ -1,16 +1,42 @@
1
- from langchain_google_genai import GoogleGenerativeAI
2
- from langchain_experimental.agents import create_csv_agent
3
- import pandas as pd
4
- from dotenv import load_dotenv
 
 
 
5
  import os
6
- load_dotenv() # take environment variables from .env.
7
 
8
- GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 
 
 
 
 
9
 
10
- data = pd.read_csv("data.csv")
11
- llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
12
- csv_agent = create_csv_agent(llm,"data.csv", verbose=True)
13
- question = "describe the dataset"
14
- response = csv_agent.run(question)
15
 
16
- print(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from data_loader import DataLoader
3
+ from data_analyzer import DataAnalyzer
4
+ from data_filter import DataFilter
5
+ from data_transformer import DataTransformer
6
+ from data_visualizer import DataVisualizer
7
+ from data_QA import DataQA
8
  import os
 
9
 
10
def main():
    """Run every pipeline stage on a single Streamlit page.

    ``data.csv`` is recreated empty on each run before the loader is called;
    the analysis, filtering, cleaning, plotting and Q/A stages only run when
    that file has content afterwards.
    """
    # Start each run from an empty data.csv.
    if os.path.exists("data.csv"):
        os.remove("data.csv")
    with open("data.csv", 'w'):
        pass
    st.title('Insights 📶')

    data_loader = DataLoader()
    data = data_loader.load_data()

    if os.path.getsize("data.csv") != 0:
        data_analyzer = DataAnalyzer(data)
        data_analyzer.show_summary_statistics()
        # DataAnalyzer no longer defines show_data_types(); show_eda() is
        # the method that reports the columns grouped by dtype.
        data_analyzer.show_eda()
        data_analyzer.show_null_value_statistics()

        data_filter = DataFilter(data)
        data = data_filter.filter_rows()

        data_transformer = DataTransformer(data)
        data = data_transformer.perform_column_operation()
        data = data_transformer.remove_null()
        data = data_transformer.impute_null()
        data = data_transformer.remove_columns()

        data_visualizer = DataVisualizer(data)
        data_visualizer.visualize_data()

        data_QA = DataQA(data)
        data_QA.ask_csv()
40
+
41
+ if __name__ == "__main__":
42
+ main()