Atharva Thakur committed on
Commit
327dc11
2 Parent(s): e3fe4bf 843ea16

Merge remote-tracking branch 'origin/main' into LLMdataparty

Browse files
.github/workflows/StreamlitTesting.yml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Python application
2
+
3
+ on:
4
+ push:
5
+ branches: [ "main" ]
6
+ pull_request:
7
+
8
+ permissions:
9
+ contents: read
10
+
11
+ jobs:
12
+ build:
13
+
14
+ runs-on: ubuntu-latest
15
+
16
+ steps:
17
+ - uses: actions/checkout@v3
18
+ - name: Set up Python 3.10
19
+ uses: actions/setup-python@v3
20
+ with:
21
+ python-version: "3.10"
22
+ - name: Install dependencies
23
+ run: |
24
+ python -m pip install --upgrade pip
25
+ pip install flake8 pytest
26
+ if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
27
+ - name: Set up environment variables
28
+ run: |
29
+ echo "GOOGLE_API_KEY=${{ secrets.GOOGLE_API_KEY }}" >> $GITHUB_ENV
30
+ - name: Lint with flake8
31
+ run: |
32
+ # stop the build if there are Python syntax errors or undefined names
33
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
34
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
35
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
36
+ - name: Test with pytest
37
+ run: |
38
+ pytest -vv
.gitignore CHANGED
@@ -9,6 +9,8 @@ __pycache__/
9
 
10
  # data set
11
  data.csv
 
 
12
  #Env variables
13
  .env
14
  # Distribution / packaging
 
9
 
10
  # data set
11
  data.csv
12
+ original_data.csv
13
+
14
  #Env variables
15
  .env
16
  # Distribution / packaging
Experiment.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
+ # Function to upload dataset
6
+ def upload_dataset():
7
+ uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
8
+ if uploaded_file is not None:
9
+ df = pd.read_csv(uploaded_file)
10
+ return df
11
+
12
+ # Function to impute null values
13
+ def impute_null(df):
14
+ # Implement your logic for null value imputation
15
+ col = st.multiselect('Choose columns to impute nulls', df.select_dtypes(include=[np.number]).columns)
16
+ option = st.selectbox('Impute nulls with', ('mean', 'mode', '0'))
17
+ if st.button('Impute Null'):
18
+ if option == "mean":
19
+ df[col] = df[col].fillna(df[col].mean())
20
+ elif option == "mode":
21
+ df[col] = df[col].fillna(df[col].mode().iloc[0]) # mode() returns a DataFrame, so we select the first row
22
+ elif option == "0":
23
+ df[col] = df[col].fillna(0)
24
+ st.success("Null values filled")
25
+ return df
26
+
27
+ # Function to display transformed data
28
+ def display_data(df):
29
+ st.write(df)
30
+
31
+ def main():
32
+ st.title("Data Transformation App")
33
+
34
+ # Step 1: Upload Dataset
35
+ st.sidebar.title("Upload Dataset")
36
+ df = upload_dataset()
37
+
38
+ if df is not None:
39
+ # Step 2: Perform Data Transformation
40
+ st.sidebar.title("Data Transformation")
41
+ if st.sidebar.button("Impute Null Values"):
42
+ df = impute_null(df)
43
+ st.success("Null values imputed successfully!")
44
+
45
+ # Step 3: Display Transformed Data
46
+ st.sidebar.title("Transformed Data")
47
+ if st.sidebar.checkbox("Show Transformed Data"):
48
+ display_data(df)
49
+
50
+ # Step 4: Store Transformed Data
51
+ # You can store the transformed data in a variable or a data structure here
52
+
53
+ # Step 5: Use Transformed Data
54
+ # You can utilize the transformed data for further analysis, visualization, etc.
55
+
56
+ if __name__ == "__main__":
57
+ main()
test.py → Experiments.py RENAMED
File without changes
app.py CHANGED
@@ -5,27 +5,72 @@ from data_filter import DataFilter
5
  from data_transformer import DataTransformer
6
  from data_visualizer import DataVisualizer
7
  from data_QA import DataQA
 
 
 
 
 
8
 
9
  def main():
10
  st.title('Insights 📶')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- data_loader = DataLoader()
13
- data,uploaded_file = data_loader.load_data()
14
 
15
- data_analyzer = DataAnalyzer(data)
16
- data_analyzer.show_summary_statistics()
17
- data_analyzer.show_data_types()
 
 
 
 
 
 
 
 
 
18
 
19
- data_filter = DataFilter(data)
20
- data = data_filter.filter_rows()
 
 
21
 
22
- data_transformer = DataTransformer(data)
23
- data = data_transformer.perform_column_operation()
 
 
24
 
25
- data_visualizer = DataVisualizer(data)
26
- data_visualizer.visualize_data()
 
27
 
28
- data_QA = DataQA(uploaded_file)
29
- data_QA.ask_csv()
30
  if __name__ == "__main__":
31
  main()
 
5
  from data_transformer import DataTransformer
6
  from data_visualizer import DataVisualizer
7
  from data_QA import DataQA
8
+ import os
9
+ from streamlit_option_menu import option_menu
10
+
11
+
12
+ import pandas as pd
13
 
14
  def main():
15
  st.title('Insights 📶')
16
+ data = pd.DataFrame()
17
+ uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
18
+ if st.button('Load Data'):
19
+ data_loader = DataLoader()
20
+ data_loader.load_data(uploaded_file)
21
+ try:
22
+ data = pd.read_csv("data.csv")
23
+ except:
24
+ st.write("Please upload a csv file")
25
+ if os.path.getsize("data.csv") != 0:
26
+ with st.sidebar:
27
+ selected = option_menu(
28
+ menu_title="Main Menu",
29
+ options=["Data Loader", "Exploratory Data Analysis", "Data Cleaning", "Q/A", "Data Party"])
30
+
31
+ # --- DATA LOADER ---
32
+ if selected == "Data Loader":
33
+ st.toast("Data Loaded")
34
+ st.write(data.head())
35
+
36
+ # --- EDA ---
37
+ if selected == "Exploratory Data Analysis":
38
+ data = pd.read_csv("data.csv")
39
+ data_analyzer = DataAnalyzer(data)
40
+ data_analyzer.show_eda()
41
+ data_analyzer.show_null_value_statistics()
42
+ data_analyzer.show_count_plots()
43
+ data_analyzer.show_summary_statistics()
44
 
45
+ data_visualizer = DataVisualizer(data)
46
+ data_visualizer.visualize_data()
47
 
48
+ # --- DATA CLEANING ---
49
+ if selected == "Data Cleaning":
50
+ data_transformer = DataTransformer(data)
51
+
52
+ modified_data = data_transformer.perform_column_operation()
53
+ modified_data = data_transformer.remove_null()
54
+ modified_data = data_transformer.impute_null()
55
+ data = modified_data
56
+ data_analyzer = DataAnalyzer(data)
57
+ data_analyzer.show_null_value_statistics()
58
+ new_data_analyzer = DataAnalyzer(modified_data)
59
+ data_analyzer.show_null_value_statistics()
60
 
61
+ # modified_data = data_transformer.remove_columns()
62
+
63
+ # data_filter = DataFilter(modified_data)
64
+ # data = data_filter.filter_rows()
65
 
66
+ # --- QUESTION AND ANSWER ---
67
+ if selected == "Q/A":
68
+ data_QA = DataQA(data)
69
+ data_QA.ask_csv()
70
 
71
+ # --- DATA PARTY ---
72
+ if selected == "Data Party":
73
+ st.write("To be continued... :)")
74
 
 
 
75
  if __name__ == "__main__":
76
  main()
data.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Name,Age,Email
2
+ John Doe,30,johndoe@example.com
3
+ Jane Smith,25,janesmith@example.com
4
+ Michael Johnson,35,michaeljohnson@example.com
5
+ Emily Brown,28,emilybrown@example.com
data_analyzer.py CHANGED
@@ -1,13 +1,66 @@
1
  import streamlit as st
 
 
 
2
 
3
  class DataAnalyzer:
4
  def __init__(self, data):
5
  self.data = data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def show_summary_statistics(self):
8
  if st.button('Show Summary Statistics'):
9
  st.write(self.data.describe())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- def show_data_types(self):
12
- if st.button('Show Data Types'):
13
- st.write(self.data.dtypes)
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
 
6
  class DataAnalyzer:
7
  def __init__(self, data):
8
  self.data = data
9
+ st.header("Exploratory Data Analysis")
10
+
11
+ def show_eda(self):
12
+ st.write("Number of rows:", self.data.shape[0])
13
+ st.write("Number of columns:", self.data.shape[1])
14
+ columns_by_dtype = {}
15
+ for column_name, dtype in self.data.dtypes.items():
16
+ dtype_str = str(dtype)
17
+ if dtype_str not in columns_by_dtype:
18
+ columns_by_dtype[dtype_str] = [column_name]
19
+ else:
20
+ columns_by_dtype[dtype_str].append(column_name)
21
+ col_type_df = []
22
+ for dtype, columns in columns_by_dtype.items():
23
+ col_type_df.append([dtype, ', '.join(columns)])
24
+ df = pd.DataFrame(col_type_df, columns=["Data Type", "Column Names"])
25
+ st.subheader("Columns by Data Type")
26
+ st.dataframe(df, hide_index=True, use_container_width=True)
27
 
28
  def show_summary_statistics(self):
29
  if st.button('Show Summary Statistics'):
30
  st.write(self.data.describe())
31
+ st.write(self.data.describe(include=object))
32
+
33
+ def show_null_value_statistics(self):
34
+ st.subheader("Null Value Statistics")
35
+ null_counts = self.data.isnull().sum()
36
+ total_null = null_counts.sum()
37
+ total_rows = self.data.shape[0]
38
+ null_percentages = (null_counts / total_rows) * 100
39
+ null_stats_df = pd.DataFrame({
40
+ 'Column Name': null_counts.index,
41
+ 'Null Values': null_counts.values,
42
+ 'Percentage Null': null_percentages.values
43
+ })
44
+ null_stats_df.loc[len(null_stats_df)] = ['Total', total_null, (total_null / (total_rows * self.data.shape[1])) * 100]
45
+ st.dataframe(null_stats_df, hide_index=True, use_container_width=True)
46
+
47
+ def show_count_plots(self):
48
+ st.subheader("Count Plots")
49
+ sns.set(style="whitegrid")
50
+
51
+ for column_name in self.data.columns:
52
+ unique_values = self.data[column_name].nunique()
53
+
54
+ if unique_values <= 12:
55
+ fig, ax = plt.subplots(figsize=(10, 6))
56
+ sns.countplot(data=self.data, x=column_name, ax=ax)
57
+ ax.set_title(f'Count Plot of {column_name}')
58
+ ax.set_xticklabels(ax.get_xticklabels())
59
+ st.pyplot(fig)
60
 
61
+ else:
62
+ fig, ax = plt.subplots(figsize=(10, 6))
63
+ sns.histplot(data=self.data, x=column_name, bins=20, ax=ax)
64
+ ax.set_title(f'Histogram of {column_name}')
65
+ ax.set_xlabel(column_name)
66
+ st.pyplot(fig)
data_loader.py CHANGED
@@ -3,21 +3,15 @@ import pandas as pd
3
 
4
  class DataLoader:
5
  def __init__(self):
6
- self.data = pd.DataFrame() # Initialize data as an empty DataFrame
7
 
8
- def load_data(self):
9
- data_source = st.selectbox('Select data source', ['Upload a CSV file', 'Input a URL'])
10
- if data_source == 'Upload a CSV file':
11
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
12
  if uploaded_file is not None:
13
- self.data = pd.read_csv(uploaded_file)
14
- file_path = './data.csv'
15
- self.data.to_csv(file_path, index=False)
16
- elif data_source == 'Input a URL':
17
- url = st.text_input('Enter the URL of a CSV file')
18
- if url:
19
- try:
20
- self.data = pd.read_csv(url)
21
- except:
22
- st.error('Could not load data from the provided URL. Please make sure the URL is correct and points to a CSV file.')
23
- return self.data,uploaded_file
 
3
 
4
  class DataLoader:
5
  def __init__(self):
6
+ pass
7
 
8
+ @st.cache_data(experimental_allow_widgets=True)
9
+ def load_data(_,uploaded_file):
10
+ if True:
11
+ data = pd.DataFrame()
12
  if uploaded_file is not None:
13
+ data = pd.read_csv(uploaded_file)
14
+ data.to_csv('./original_data.csv', index=False)
15
+ data.to_csv('./data.csv',index=False)
16
+ print("data loader ran once")
17
+ return True
 
 
 
 
 
 
data_transformer.py CHANGED
@@ -1,9 +1,12 @@
1
  import streamlit as st
2
  import pandas as pd
 
3
 
4
  class DataTransformer:
5
  def __init__(self, data):
6
  self.data = data
 
 
7
 
8
  def perform_column_operation(self):
9
  column_operation = st.sidebar.text_input('Column operation (e.g., age * 2)')
@@ -13,8 +16,43 @@ class DataTransformer:
13
  st.write(self.data)
14
  return self.data
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  #transformed data is not retained
17
  #null values handling
18
  #2 options - to remove or to impute that is the question
 
 
 
19
  #give option to analyse the transformed dataset or save it.
20
 
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import numpy as np
4
 
5
  class DataTransformer:
6
  def __init__(self, data):
7
  self.data = data
8
+ st.header("Data Cleaning")
9
+ st.divider()
10
 
11
  def perform_column_operation(self):
12
  column_operation = st.sidebar.text_input('Column operation (e.g., age * 2)')
 
16
  st.write(self.data)
17
  return self.data
18
 
19
+ def remove_null(self):
20
+ st.header("Remove Null Values")
21
+ col = st.multiselect('Choose columns to remove nulls', self.data.columns)
22
+ if st.button('Remove Null'):
23
+ self.data.dropna(subset=col, inplace=True)
24
+ st.success("Null values removed")
25
+ return self.data
26
+
27
+ def impute_null(self):
28
+ st.header("Impute Null Values")
29
+ col = st.multiselect('Choose columns to impute nulls', self.data.select_dtypes(include=[np.number]).columns)
30
+ option = st.selectbox('Impute nulls with', ('mean', 'mode', '0'))
31
+ if st.button('Impute Null'):
32
+ if option == "mean":
33
+ self.data[col] = self.data[col].fillna(self.data[col].mean())
34
+ elif option == "mode":
35
+ self.data[col] = self.data[col].fillna(self.data[col].mode().iloc[0]) # mode() returns a DataFrame, so we select the first row
36
+ elif option == "0":
37
+ self.data[col] = self.data[col].fillna(0)
38
+ st.success("Null values filled")
39
+ self.data.to_csv("data.csv", index=False)
40
+ return self.data
41
+
42
+ def remove_columns(self):
43
+ st.header("Remove Columns")
44
+ col = st.multiselect('Choose columns to remove', self.data.columns)
45
+ if st.button('Remove Columns'):
46
+ self.data.drop(columns=col, inplace=True)
47
+ st.success("Columns removed")
48
+ return self.data
49
+
50
+ # PROBLEMS RESOLVED
51
  #transformed data is not retained
52
  #null values handling
53
  #2 options - to remove or to impute that is the question
54
+
55
+ # PROBLEMS TO BE ADDRESSED
56
+ #categorical to numerical
57
  #give option to analyse the transformed dataset or save it.
58
 
data_visualizer.py CHANGED
@@ -7,9 +7,11 @@ import seaborn as sns
7
  class DataVisualizer:
8
  def __init__(self, data):
9
  self.data = data
 
10
 
11
  def visualize_data(self):
12
  plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
 
13
  if plot_type == 'Histogram':
14
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
15
  if numeric_columns.empty:
@@ -18,7 +20,11 @@ class DataVisualizer:
18
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
19
  fig, ax = plt.subplots()
20
  ax.hist(self.data[column_to_visualize])
 
 
 
21
  st.pyplot(fig)
 
22
  elif plot_type == 'Box Plot':
23
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
24
  if numeric_columns.empty:
@@ -27,23 +33,42 @@ class DataVisualizer:
27
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
28
  fig, ax = plt.subplots()
29
  ax.boxplot(self.data[column_to_visualize].dropna())
 
 
30
  st.pyplot(fig)
 
31
  elif plot_type == 'Pie Chart':
32
- column_to_visualize = st.selectbox('Choose a column to visualize', self.data.select_dtypes(include=['object']).columns)
33
- fig, ax = plt.subplots()
34
- self.data[column_to_visualize].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%', textprops={'fontsize': 'small'})
35
- st.pyplot(fig)
 
 
 
 
 
 
 
36
  elif plot_type == 'Scatter Plot':
37
- columns_to_visualize = st.multiselect('Choose two columns to visualize', self.data.select_dtypes(include=[np.number]).columns)
38
- if len(columns_to_visualize) != 2:
39
- st.warning('Please select exactly two columns for scatter plot.')
 
 
 
 
40
  else:
41
  fig, ax = plt.subplots()
42
- ax.scatter(self.data[columns_to_visualize[0]], self.data[columns_to_visualize[1]])
 
 
 
43
  st.pyplot(fig)
 
44
  elif plot_type == 'Heatmap':
45
  numeric_data = self.data.select_dtypes(include=[np.number])
46
  corr = numeric_data.corr()
47
  fig, ax = plt.subplots()
48
  sns.heatmap(corr, annot=True, ax=ax)
 
49
  st.pyplot(fig)
 
7
  class DataVisualizer:
8
  def __init__(self, data):
9
  self.data = data
10
+ st.subheader("Data Visualizer")
11
 
12
  def visualize_data(self):
13
  plot_type = st.selectbox('Choose a type of plot', ['Histogram', 'Box Plot', 'Pie Chart', 'Scatter Plot', 'Heatmap'])
14
+
15
  if plot_type == 'Histogram':
16
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
17
  if numeric_columns.empty:
 
20
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
21
  fig, ax = plt.subplots()
22
  ax.hist(self.data[column_to_visualize])
23
+ ax.set_title(f'Histogram of {column_to_visualize}')
24
+ ax.set_xlabel(column_to_visualize)
25
+ ax.set_ylabel('Frequency')
26
  st.pyplot(fig)
27
+
28
  elif plot_type == 'Box Plot':
29
  numeric_columns = self.data.select_dtypes(include=[np.number]).columns
30
  if numeric_columns.empty:
 
33
  column_to_visualize = st.selectbox('Choose a column to visualize', numeric_columns)
34
  fig, ax = plt.subplots()
35
  ax.boxplot(self.data[column_to_visualize].dropna())
36
+ ax.set_title(f'Box Plot of {column_to_visualize}')
37
+ ax.set_ylabel(column_to_visualize)
38
  st.pyplot(fig)
39
+
40
  elif plot_type == 'Pie Chart':
41
+ nonnumeric_columns = self.data.select_dtypes(include=['object']).columns
42
+ if nonnumeric_columns.empty:
43
+ st.warning('No non numeric columns in the data to visualize.')
44
+ else:
45
+ column_to_visualize = st.selectbox('Choose a column to visualize', nonnumeric_columns)
46
+ fig, ax = plt.subplots()
47
+ self.data[column_to_visualize].value_counts().plot(kind='pie', ax=ax, autopct='%1.1f%%', textprops={'fontsize': 'small'})
48
+ ax.set_title(f'Pie Chart of {column_to_visualize}')
49
+ ax.set_ylabel('')
50
+ st.pyplot(fig)
51
+
52
  elif plot_type == 'Scatter Plot':
53
+ left, right = st.columns(2)
54
+ with left:
55
+ x_col = st.selectbox('Choose values on X axis', self.data.select_dtypes(include=[np.number]).columns)
56
+ with right:
57
+ y_col = st.selectbox('Choose values on Y axis', self.data.select_dtypes(include=[np.number]).columns)
58
+ if x_col == y_col:
59
+ st.warning('Please select two different columns for scatter plot.')
60
  else:
61
  fig, ax = plt.subplots()
62
+ ax.scatter(self.data[x_col], self.data[y_col])
63
+ ax.set_title(f'Scatter Plot of {x_col} vs {y_col}')
64
+ ax.set_xlabel(x_col)
65
+ ax.set_ylabel(y_col)
66
  st.pyplot(fig)
67
+
68
  elif plot_type == 'Heatmap':
69
  numeric_data = self.data.select_dtypes(include=[np.number])
70
  corr = numeric_data.corr()
71
  fig, ax = plt.subplots()
72
  sns.heatmap(corr, annot=True, ax=ax)
73
+ ax.set_title('Correlation Heatmap')
74
  st.pyplot(fig)
requirements.txt CHANGED
@@ -7,4 +7,7 @@ langchain-google-genai
7
  langchain-experimental
8
  python-dotenv
9
  tabulate
10
- litellm
 
 
 
 
7
  langchain-experimental
8
  python-dotenv
9
  tabulate
10
+ litellm
11
+ streamlit_option_menu
12
+ pytest
13
+
test_app.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from streamlit.testing.v1 import AppTest
2
+
3
+ def test_smoke():
4
+ """Basic smoke test"""
5
+ at = AppTest.from_file("app.py", default_timeout=10).run()
6
+ # Supported elements are primarily exposed as properties on the script
7
+ # results object, which returns a sequence of that element.
8
+ assert not at.exception
9
+