Atharva Thakur committed on
Commit
327dc11
2 Parent(s): e3fe4bf 843ea16

Merge remote-tracking branch 'origin/main' into LLMdataparty

Browse files
.github/workflows/StreamlitTesting.yml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Python application
2
+
3
+ on:
4
+ push:
5
+ branches: [ "main" ]
6
+ pull_request:
7
+
8
+ permissions:
9
+ contents: read
10
+
11
+ jobs:
12
+ build:
13
+
14
+ runs-on: ubuntu-latest
15
+
16
+ steps:
17
+ - uses: actions/checkout@v3
18
+ - name: Set up Python 3.10
19
+ uses: actions/setup-python@v3
20
+ with:
21
+ python-version: "3.10"
22
+ - name: Install dependencies
23
+ run: |
24
+ python -m pip install --upgrade pip
25
+ pip install flake8 pytest
26
+ if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
27
+ - name: Set up environment variables
28
+ run: |
29
+ echo "GOOGLE_API_KEY=${{ secrets.GOOGLE_API_KEY }}" >> $GITHUB_ENV
30
+ - name: Lint with flake8
31
+ run: |
32
+ # stop the build if there are Python syntax errors or undefined names
33
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
34
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
35
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
36
+ - name: Test with pytest
37
+ run: |
38
+ pytest -vv
.gitignore CHANGED
@@ -9,6 +9,8 @@ __pycache__/
9
 
10
  # data set
11
  data.csv
 
 
12
  #Env variables
13
  .env
14
  # Distribution / packaging
 
9
 
10
  # data set
11
  data.csv
12
+ original_data.csv
13
+
14
  #Env variables
15
  .env
16
  # Distribution / packaging
Experiment.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
+ # Function to upload dataset
6
+ def upload_dataset():
7
+ uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
8
+ if uploaded_file is not None:
9
+ df = pd.read_csv(uploaded_file)
10
+ return df
11
+
12
+ # Function to impute null values
13
+ def impute_null(df):
14
+ # Implement your logic for null value imputation
15
+ col = st.multiselect('Choose columns to impute nulls', df.select_dtypes(include=[np.number]).columns)
16
+ option = st.selectbox('Impute nulls with', ('mean', 'mode', '0'))
17
+ if st.button('Impute Null'):
18
+ if option == "mean":
19
+ df[col] = df[col].fillna(df[col].mean())
20
+ elif option == "mode":
21
+ df[col] = df[col].fillna(df[col].mode().iloc[0]) # mode() returns a DataFrame, so we select the first row
22
+ elif option == "0":
23
+ df[col] = df[col].fillna(0)
24
+ st.success("Null values filled")
25
+ return df
26
+
27
+ # Function to display transformed data
28
+ def display_data(df):
29
+ st.write(df)
30
+
31
+ def main():
32
+ st.title("Data Transformation App")
33
+
34
+ # Step 1: Upload Dataset
35
+ st.sidebar.title("Upload Dataset")
36
+ df = upload_dataset()
37
+
38
+ if df is not None:
39
+ # Step 2: Perform Data Transformation
40
+ st.sidebar.title("Data Transformation")
41
+ if st.sidebar.button("Impute Null Values"):
42
+ df = impute_null(df)
43
+ st.success("Null values imputed successfully!")
44
+
45
+ # Step 3: Display Transformed Data
46
+ st.sidebar.title("Transformed Data")
47
+ if st.sidebar.checkbox("Show Transformed Data"):
48
+ display_data(df)
49
+
50
+ # Step 4: Store Transformed Data
51
+ # You can store the transformed data in a variable or a data structure here
52
+
53
+ # Step 5: Use Transformed Data
54
+ # You can utilize the transformed data for further analysis, visualization, etc.
55
+
56
+ if __name__ == "__main__":
57
+ main()
test.py → Experiments.py RENAMED
File without changes
app.py CHANGED
@@ -5,27 +5,72 @@ from data_filter import DataFilter
5
  from data_transformer import DataTransformer
6
  from data_visualizer import DataVisualizer
7
  from data_QA import DataQA
 
 
 
 
 
8
 
9
  def main():
10
  st.title('Insights 📶')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- data_loader = DataLoader()
13
- data,uploaded_file = data_loader.load_data()
14
 
15
- data_analyzer = DataAnalyzer(data)
16
- data_analyzer.show_summary_statistics()
17
- data_analyzer.show_data_types()
 
 
 
 
 
 
 
 
 
18
 
19
- data_filter = DataFilter(data)
20
- data = data_filter.filter_rows()
 
 
21
 
22
- data_transformer = DataTransformer(data)
23
- data = data_transformer.perform_column_operation()
 
 
24
 
25
- data_visualizer = DataVisualizer(data)
26
- data_visualizer.visualize_data()
 
27
 
28
- data_QA = DataQA(uploaded_file)
29
- data_QA.ask_csv()
30
  if __name__ == "__main__":
31
  main()
 
5
  from data_transformer import DataTransformer
6
  from data_visualizer import DataVisualizer
7
  from data_QA import DataQA
8
+ import os
9
+ from streamlit_option_menu import option_menu
10
+
11
+
12
+ import pandas as pd
13
 
14
  def main():
15
  st.title('Insights 📶')
16
+ data = pd.DataFrame()
17
+ uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
18
+ if st.button('Load Data'):
19
+ data_loader = DataLoader()
20
+ data_loader.load_data(uploaded_file)
21
+ try:
22
+ data = pd.read_csv("data.csv")
23
+ except:
24
+ st.write("Please upload a csv file")
25
+ if os.path.getsize("data.csv") != 0:
26
+ with st.sidebar:
27
+ selected = option_menu(
28
+ menu_title="Main Menu",
29
+ options=["Data Loader", "Exploratory Data Analysis", "Data Cleaning", "Q/A", "Data Party"])
30
+
31
+ # --- DATA LOADER ---
32
+ if selected == "Data Loader":
33
+ st.toast("Data Loaded")
34
+ st.write(data.head())
35
+
36
+ # --- EDA ---
37
+ if selected == "Exploratory Data Analysis":
38
+ data = pd.read_csv("data.csv")
39
+ data_analyzer = DataAnalyzer(data)
40
+ data_analyzer.show_eda()
41
+ data_analyzer.show_null_value_statistics()
42
+ data_analyzer.show_count_plots()
43
+ data_analyzer.show_summary_statistics()
44
 
45
+ data_visualizer = DataVisualizer(data)
46
+ data_visualizer.visualize_data()
47
 
48
+ # --- DATA CLEANING ---
49
+ if selected == "Data Cleaning":
50
+ data_transformer = DataTransformer(data)
51
+
52
+ modified_data = data_transformer.perform_column_operation()
53
+ modified_data = data_transformer.remove_null()
54
+ modified_data = data_transformer.impute_null()
55
+ data = modified_data
56
+ data_analyzer = DataAnalyzer(data)
57
+ data_analyzer.show_null_value_statistics()
58
+ new_data_analyzer = DataAnalyzer(modified_data)
59
+ data_analyzer.show_null_value_statistics()
60
 
61
+ # modified_data = data_transformer.remove_columns()
62
+
63
+ # data_filter = DataFilter(modified_data)
64
+ # data = data_filter.filter_rows()
65
 
66
+ # --- QUESTION AND ANSWER ---
67
+ if selected == "Q/A":
68
+ data_QA = DataQA(data)
69
+ data_QA.ask_csv()
70
 
71
+ # --- DATA PARTY ---
72
+ if selected == "Data Party":
73
+ st.write("To be continued... :)")
74
 
 
 
75
  if __name__ == "__main__":
76
  main()
data.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Name,Age,Email
2
+ John Doe,30,johndoe@example.com
3
+ Jane Smith,25,janesmith@example.com
4
+ Michael Johnson,35,michaeljohnson@example.com
5
+ Emily Brown,28,emilybrown@example.com
data_analyzer.py CHANGED
@@ -1,13 +1,66 @@
1
  import streamlit as st
 
 
 
2
 
3
  class DataAnalyzer:
4
  def __init__(self, data):
5
  self.data = data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def show_summary_statistics(self):
8
  if st.button('Show Summary Statistics'):
9
  st.write(self.data.describe())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- def show_data_types(self):
12
- if st.button('Show Data Types'):
13
- st.write(self.data.dtypes)
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
 
6
  class DataAnalyzer:
7
  def __init__(self, data):
8
  self.data = data
9
+ st.header("Exploratory Data Analysis")
10
+
11
+ def show_eda(self):
12
+ st.write("Number of rows:", self.data.shape[0])
13
+ st.write("Number of columns:", self.data.shape[1])
14
+ columns_by_dtype = {}
15
+ for column_name, dtype in self.data.dtypes.items():
16
+ dtype_str = str(dtype)
17
+ if dtype_str not in columns_by_dtype:
18
+ columns_by_dtype[dtype_str] = [column_name]
19
+ else:
20
+ columns_by_dtype[dtype_str].append(column_name)
21
+ col_type_df = []
22
+ for dtype, columns in columns_by_dtype.items():
23
+ col_type_df.append([dtype, ', '.join(columns)])
24
+ df = pd.DataFrame(col_type_df, columns=["Data Type", "Column Names"])
25
+ st.subheader("Columns by Data Type")
26
+ st.dataframe(df, hide_index=True, use_container_width=True)
27
 
28
  def show_summary_statistics(self):
29
  if st.button('Show Summary Statistics'):
30
  st.write(self.data.describe())
31
+ st.write(self.data.describe(include=object))
32
+
33
+ def show_null_value_statistics(self):
34
+ st.subheader("Null Value Statistics")
35
+ null_counts = self.data.isnull().sum()
36
+ total_null = null_counts.sum()
37
+ total_rows = self.data.shape[0]
38
+ null_percentages = (null_counts / total_rows) * 100
39
+ null_stats_df = pd.DataFrame({
40
+ 'Column Name': null_counts.index,
41
+ 'Null Values': null_counts.values,
42
+ 'Percentage Null': null_percentages.values
43
+ })
44
+ null_stats_df.loc[len(null_stats_df)] = ['Total', total_null, (total_null / (total_rows * self.data.shape[1])) * 100]
45
+ st.dataframe(null_stats_df, hide_index=True, use_container_width=True)
46
+
47
+ def show_count_plots(self):
48
+ st.subheader("Count Plots")
49
+ sns.set(style="whitegrid")
50
+
51
+ for column_name in self.data.columns:
52
+ unique_values = self.data[column_name].nunique()
53
+
54
+ if unique_values <= 12:
55
+ fig, ax = plt.subplots(figsize=(10, 6))
56
+ sns.countplot(data=self.data, x=column_name, ax=ax)
57
+ ax.set_title(f'Count Plot of {column_name}')
58
+ ax.set_xticklabels(ax.get_xticklabels())
59
+ st.pyplot(fig)
60
 
61
+ else:
62
+ fig, ax = plt.subplots(figsize=(10, 6))
63
+ sns.histplot(data=self.data, x=column_name, bins=20, ax=ax)
64
+ ax.set_title(f'Histogram of {column_name}')
65
+ ax.set_xlabel(column_name)
66
+ st.pyplot(fig)
data_loader.py CHANGED
@@ -3,21 +3,15 @@ import pandas as pd
3
 
4
  class DataLoader:
5
  def __init__(self):
6
- self.data = pd.DataFrame() # Initialize data as an empty DataFrame
7
 
8
- def load_data(self):
9
- data_source = st.selectbox('Select data source', ['Upload a CSV file', 'Input a URL'])
10
- if data_source == 'Upload a CSV file':
11
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
12
  if uploaded_file is not None:
13
- self.data = pd.read_csv(uploaded_file)
14
- file_path = './data.csv'
15
- self.data.to_csv(file_path, index=False)
16
- elif data_source == 'Input a URL':
17
- url = st.text_input('Enter the URL of a CSV file')
18
- if url:
19
- try:
20
- self.data = pd.read_csv(url)
21
- except:
22
- st.error('Could not load data from the provided URL. Please make sure the URL is correct and points to a CSV file.')
23
- return self.data,uploaded_file
 
3
 
4
  class DataLoader:
5
  def __init__(self):
6
+ pass
7
 
8
+ @st.cache_data(experimental_allow_widgets=True)
9
+ def load_data(_,uploaded_file):
10
+ if True:
11
+ data = pd.DataFrame()
12
  if uploaded_file is not None:
13
+ data = pd.read_csv(uploaded_file)
14
+ data.to_csv('./original_data.csv', index=False)
15
+ data.to_csv('./data.csv',index=False)
16
+ print("data loader ran once")
17
+ return True
 
 
 
 
 
 
data_transformer.py CHANGED
@@ -1,9 +1,12 @@
1
  import streamlit as st
2
  import pandas as pd
 
3
 
4
  class DataTransformer:
5
  def __init__(self, data):
6
  self.data = data
 
 
7
 
8
  def perform_column_operation(self):
9
  column_operation = st.sidebar.text_input('Column operation (e.g., age * 2)')
@@ -13,8 +16,43 @@ class DataTransformer:
13
  st.write(self.data)
14
  return self.data
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  #transformed data is not retained
17
  #null values handling
18
  #2 options - to remove or to impute that is the question
 
 
 
19
  #give option to analyse the transformed dataset or save it.
20
 
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import numpy as np
4
 
5
  class DataTransformer:
6
  def __init__(self, data):
7
  self.data = data
8
+ st.header("Data Cleaning")
9
+ st.divider()
10
 
11
  def perform_column_operation(self):
12
  column_operation = st.sidebar.text_input('Column operation (e.g., age * 2)')
 
16
  st.write(self.data)
17
  return self.data
18
 
19
+ def remove_null(self):
20
+ st.header("Remove Null Values")
21
+ col = st.multiselect('Choose columns to remove nulls', self.data.columns)
22
+ if st.button('Remove Null'):
23
+ self.data.dropna(subset=col, inplace=True)
24
+ st.success("Null values removed")
25
+ return self.data
26
+
27
+ def impute_null(self):
28
+ st.header("Impute Null Values")
29
+ col = st.multiselect('Choose columns to impute nulls', self.data.select_dtypes(include=[np.number]).columns)
30
+ option = st.selectbox('Impute nulls with', ('mean', 'mode', '0'))
31
+ if st.button('Impute Null'):
32
+ if option == "mean":
33
+ self.data[col] = self.data[col].fillna(self.data[col].mean())
34
+ elif option == "mode":
35
+ self.data[col] = self.data[col].fillna(self.data[col].mode().iloc[0]) # mode() returns a DataFrame, so we select the first row
36
+ elif option == "0":
37
+ self.data[col] = self.data[col].fillna(0)
38
+ st.success("Null values filled")
39
+ self.data.to_csv("data.csv", index=False)
40
+ return self.data
41
+
42
+ def remove_columns(self):
43
+ st.header("Remove Columns")
44
+ col = st.multiselect('Choose columns to remove', self.data.columns)
45
+ if st.button('Remove Columns'):
46
+ self.data.drop(columns=col, inplace=True)
47
+ st.success("Columns removed")
48
+ return self.data
49
+
50
+ # PROBLEMS RESOLVED
51
  #transformed data is not retained
52
  #null values handling
53
  #2 options - to remove or to impute that is the question
54
+
55
+ # PROBLEMS TO BE ADDRESSED
56
+ #categorical to numerical
57
  #give option to analyse the transformed dataset or save it.
58
 
data_visualizer.py CHANGED
@@ -7,9 +7,11 @@ import seaborn as sns
7
  class DataVisualizer:
8
  def __init__(self, data):
9
  self.data = data
 
10
 
11
  def visualize_data(self):
12
  plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
 
13
  if plot_type == 'Histogram':
14
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
15
  if numeric_columns.empty:
@@ -18,7 +20,11 @@ class DataVisualizer:
18
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
19
  fig, ax = plt.subplots()
20
  ax.hist(self.data[column_to_visualize])
 
 
 
21
  st.pyplot(fig)
 
22
  elif plot_type == 'Box Plot':
23
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
24
  if numeric_columns.empty:
@@ -27,23 +33,42 @@ class DataVisualizer:
27
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
28
  fig, ax = plt.subplots()
29
  ax.boxplot(self.data[column_to_visualize].dropna())
 
 
30
  st.pyplot(fig)
 
31
  elif plot_type == 'Pie Chart':
32
- column_to_visualize = st.selectbox('Choose a column to visualize', self.data.select_dtypes(include=['object']).columns)
33
- fig, ax = plt.subplots()
34
- self.data[column_to_visualize].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%', textprops={'fontsize': 'small'})
35
- st.pyplot(fig)
 
 
 
 
 
 
 
36
  elif plot_type == 'Scatter Plot':
37
- columns_to_visualize = st.multiselect('Choose two columns to visualize', self.data.select_dtypes(include=[np.number]).columns)
38
- if len(columns_to_visualize) != 2:
39
- st.warning('Please select exactly two columns for scatter plot.')
 
 
 
 
40
  else:
41
  fig, ax = plt.subplots()
42
- ax.scatter(self.data[columns_to_visualize[0]], self.data[columns_to_visualize[1]])
 
 
 
43
  st.pyplot(fig)
 
44
  elif plot_type == 'Heatmap':
45
  numeric_data = self.data.select_dtypes(include=[np.number])
46
  corr = numeric_data.corr()
47
  fig, ax = plt.subplots()
48
  sns.heatmap(corr, annot=True, ax=ax)
 
49
  st.pyplot(fig)
 
7
  class DataVisualizer:
8
  def __init__(self, data):
9
  self.data = data
10
+ st.subheader("Data Visualizer")
11
 
12
  def visualize_data(self):
13
  plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
14
+
15
  if plot_type == 'Histogram':
16
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
17
  if numeric_columns.empty:
 
20
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
21
  fig, ax = plt.subplots()
22
  ax.hist(self.data[column_to_visualize])
23
+ ax.set_title(f'Histogram of {column_to_visualize}')
24
+ ax.set_xlabel(column_to_visualize)
25
+ ax.set_ylabel('Frequency')
26
  st.pyplot(fig)
27
+
28
  elif plot_type == 'Box Plot':
29
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
30
  if numeric_columns.empty:
 
33
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
34
  fig, ax = plt.subplots()
35
  ax.boxplot(self.data[column_to_visualize].dropna())
36
+ ax.set_title(f'Box Plot of {column_to_visualize}')
37
+ ax.set_ylabel(column_to_visualize)
38
  st.pyplot(fig)
39
+
40
  elif plot_type == 'Pie Chart':
41
+ nonnumeric_columns = self.data.select_dtypes(include=['object']).columns
42
+ if nonnumeric_columns.empty:
43
+ st.warning('No non numeric columns in the data to visualize.')
44
+ else:
45
+ column_to_visualize = st.selectbox('Choose a column to visualize', nonnumeric_columns)
46
+ fig, ax = plt.subplots()
47
+ self.data[column_to_visualize].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%', textprops={'fontsize': 'small'})
48
+ ax.set_title(f'Pie Chart of {column_to_visualize}')
49
+ ax.set_ylabel('')
50
+ st.pyplot(fig)
51
+
52
  elif plot_type == 'Scatter Plot':
53
+ left, right = st.columns(2)
54
+ with left:
55
+ x_col = st.selectbox('Choose values on X axis', self.data.select_dtypes(include=[np.number]).columns)
56
+ with right:
57
+ y_col = st.selectbox('Choose values on Y axis', self.data.select_dtypes(include=[np.number]).columns)
58
+ if x_col == y_col:
59
+ st.warning('Please select two different columns for scatter plot.')
60
  else:
61
  fig, ax = plt.subplots()
62
+ ax.scatter(self.data[x_col], self.data[y_col])
63
+ ax.set_title(f'Scatter Plot of {x_col} vs {y_col}')
64
+ ax.set_xlabel(x_col)
65
+ ax.set_ylabel(y_col)
66
  st.pyplot(fig)
67
+
68
  elif plot_type == 'Heatmap':
69
  numeric_data = self.data.select_dtypes(include=[np.number])
70
  corr = numeric_data.corr()
71
  fig, ax = plt.subplots()
72
  sns.heatmap(corr, annot=True, ax=ax)
73
+ ax.set_title('Correlation Heatmap')
74
  st.pyplot(fig)
requirements.txt CHANGED
@@ -7,4 +7,7 @@ langchain-google-genai
7
  langchain-experimental
8
  python-dotenv
9
  tabulate
10
- litellm
 
 
 
 
7
  langchain-experimental
8
  python-dotenv
9
  tabulate
10
+ litellm
11
+ streamlit_option_menu
12
+ pytest
13
+
test_app.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from streamlit.testing.v1 import AppTest
2
+
3
+ def test_smoke():
4
+ """Basic smoke test"""
5
+ at = AppTest.from_file("app.py", default_timeout=10).run()
6
+ # Supported elements are primarily exposed as properties on the script
7
+ # results object, which returns a sequence of that element.
8
+ assert not at.exception
9
+